scripts/precompute-diversity-sequences-gpu

#!/usr/bin/env bash

# Wrapper script to run GPU diversity cache generation
# Runs from the project root (it no longer changes into libs/vulkan-compute/
# for shader loading), then runs the embedded Lua script

# {{{ Setup directory path
# Project root: first positional argument, or the default checkout location.
DIR="${1:-/mnt/mtwo/programming/ai-stuff/neocities-modernization}"

# Issue 10-057: the GPU shader paths (vk_diversity.c) and library path (vk_compute.lua)
# are now project-root-relative / DIR-based, matching similarity -- so we run from the
# project root, no longer cd-ing into libs/vulkan-compute/ for shader loading.
cd "$DIR" || {
    echo "Error: Failed to change to project directory: $DIR" >&2
    exit 1
}

# Export DIR so Lua can access it (the embedded chunk reads it with os.getenv)
export DIR

# Ensure the on-disk diversity-cache directory exists BEFORE the (~40-minute) GPU
# run, inferring it from the selected model via scripts/cache-dir --disk (the one
# place that maps a model name to its directory). run.sh already creates this at
# model-load; this keeps the wrapper safe when invoked on its own. Without it a
# brand-new model would compute for an hour and only then fail at the final write.
_disk_dir="$(luajit "$DIR/scripts/cache-dir" "$DIR" --disk)"
if [ -n "$_disk_dir" ]; then
    # Fail now rather than after the GPU run: the final write targets this directory.
    mkdir -p "$_disk_dir" || {
        echo "Error: Failed to create diversity cache directory: $_disk_dir" >&2
        exit 1
    }
else
    # The resolver printed nothing; keep going (the Lua side resolves its own
    # output path) but make the skipped pre-create step visible.
    echo "Warning: could not resolve the on-disk cache directory; skipping pre-create" >&2
fi

# Run the Lua script

exec luajit -e '
-- Pre-computes diversity sequences using GPU acceleration (Vulkan compute shaders)
-- Replaces CPU-based effil implementation with 2,600× faster GPU batch processing
-- Output: assets/embeddings/embeddinggemma_latest/diversity_cache.json
--
-- GPU ARCHITECTURE (Issue 9-005):
-- - GPU computes sequences and returns Lua tables
-- - CPU formats tables as JSON and writes to disk
-- - Separation of concerns: GPU = compute, CPU = format/persist
--
-- Performance: ~58 seconds for 7,797 sequences (vs ~42 hours on CPU)

-- Get DIR from environment (set by wrapper script)
-- Resolution order: the DIR environment variable, then the first script
-- argument, then the hard-coded default checkout. The arg table is guarded
-- because it may be undefined when luajit only runs -e chunks, and indexing
-- a nil arg would abort the chunk before anything else ran.
local DIR = os.getenv("DIR") or (arg and arg[1]) or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"

-- Add library paths (project libs, the Vulkan compute Lua bindings, and src)
-- ahead of the default search path.
package.path = DIR .. "/libs/?.lua;" .. DIR .. "/libs/vulkan-compute/lua/?.lua;" .. DIR .. "/src/?.lua;" .. package.path

-- Issue 10-057: VK_COMPUTE_LIB no longer set here -- vk_compute.lua defaults to the
-- DIR-based absolute library path, so it loads regardless of working directory.

local ffi = require("ffi")
local vk = require("vk_compute")
local utils = require("utils")
local dkjson = require("dkjson")
-- Wall-clock time so the "Total Time" / "sequences/sec" lines reflect
-- real seconds rather than the CPU time the host thread happened to use
-- (which is near zero while the GPU is doing the work).
local socket = require("socket")
local wall_clock = socket.gettime

-- Configuration
-- BATCH_SIZE is the number of diversity sequences handed to the GPU per batch.
-- It is only the requested size: main() replaces it with the value returned by
-- the memory budgeter when that probe succeeds.
local BATCH_SIZE = 3584 -- Optimal for GTX 1080 Ti (3,584 CUDA cores)
-- The model determines the embeddings directory we read from. It is resolved
-- through inference-server-config, which reads the overrides notepad for this
-- run (tmp/run-overrides.lua, written by run.sh from --model) and falls back to
-- config.lua when no override was set -- so this wrapper agrees with every other
-- stage and still works when invoked directly without run.sh.
--
-- NOTE: this whole chunk runs inside a bash luajit -e (single-quoted)
-- string, so an apostrophe in any comment here would terminate that
-- string early and feed luajit a bogus script name (it once tried to
-- open a file literally named "selected"). Keep these comments
-- apostrophe-free, or move this program into its own .lua file.
local inference_config = require("inference-server-config")
-- The project root must be set before the selected model is queried.
inference_config.set_project_root(DIR)
local MODEL_NAME = inference_config.get_selected_model()

-- Issue 10-057: each diversity sequence is truncated to the top-K poems that a
-- page set can display, where K = pages x poems_per_page. Both factors come from
-- the run environment (PAGES / POEMS_PER_PAGE) when those parse as numbers, and
-- from config.pagination otherwise -- the same formula the similarity cache uses.
-- The walk is built front-to-back, so the first K entries are exactly what the
-- diversity pages show; keeping the full walk only bloats the JSON on disk and
-- the RAM the HTML stage parses it back into. Keep this text apostrophe-free
-- (the chunk sits inside a single-quoted bash string).
local TOP_K = 0
do
    local loader = require("config-loader")
    loader.set_project_root(DIR)

    local pagination = loader.load().pagination
    if not pagination then
        error("config.pagination missing; cannot size the diversity cache")
    end

    -- An environment override wins whenever it converts to a number.
    local page_count = tonumber(os.getenv("PAGES")) or pagination.minimum_pages
    local poems_each = tonumber(os.getenv("POEMS_PER_PAGE")) or pagination.poems_per_page
    if not (page_count and poems_each) then
        error("cannot resolve pages/poems_per_page for diversity cache")
    end

    TOP_K = page_count * poems_each
    local banner = "[Config] Capping each sequence to top-%d (K = %d pages x %d per page)"
    print(banner:format(TOP_K, page_count, poems_each))
end

-- Parse command line arguments
-- SAVE_BINARY turns on the optional binary dump of the GPU output (debug aid).
-- Scanning starts at index 2 because index 1 is consumed as the project
-- directory fallback when DIR is resolved. The arg table is guarded: it may be
-- undefined when luajit only runs -e chunks, and taking the length of nil
-- would abort the whole chunk.
local SAVE_BINARY = false -- Optional: save binary format for debug
do
    local argv = arg or {}
    for i = 2, #argv do
        if argv[i] == "--save-binary" then
            SAVE_BINARY = true
        end
    end
end

-- {{{ local function get_file_size
-- Reports the size in bytes of the file at filepath by seeking to its end.
-- Returns 0 when the file cannot be opened.
local function get_file_size(filepath)
    local handle = io.open(filepath, "rb")
    if handle == nil then
        return 0
    end
    local byte_count = handle:seek("end")
    handle:close()
    return byte_count
end
-- }}}

-- {{{ local function main
-- Generates the diversity cache end to end:
--   1. resolves the input/output paths for MODEL_NAME,
--   2. loads the embeddings as an FP16 buffer (from the binary cache when it is
--      fresh, otherwise by converting embeddings.json and writing that cache),
--   3. initializes the GPU, sizes the batch, and computes one sequence per poem,
--   4. caps each sequence to TOP_K entries and writes diversity_cache.json.
-- Takes no arguments and returns nothing. Failures are raised with error() and
-- are expected to be caught by the pcall around main() at the end of the chunk.
-- Reminder: this chunk lives inside a single-quoted bash string, so comments
-- and strings in here must stay apostrophe-free.
local function main()
    print("=" .. string.rep("=", 78))
    print(" GPU Diversity Cache Generation")
    print("=" .. string.rep("=", 78))
    print()

    -- Build paths
    -- Issue 10-054: read embeddings from the movable cache (RAM), but write the
    -- diversity cache to disk (embeddings_dir_disk) so it survives a reboot.
    local embeddings_file = utils.embeddings_dir(MODEL_NAME) .. "/embeddings.json"
    local output_file = utils.embeddings_dir_disk(MODEL_NAME) .. "/diversity_cache.json"
    -- The on-disk output directory is created by the wrapper (bash, above) and by
    -- run.sh at model-load, so it exists by the time we write here.
    local binary_file = string.format("%s/output/diversity-cache-gpu-batch.bin", DIR)

    print(string.format("[Config] Model: %s", MODEL_NAME))
    print(string.format("[Config] Batch Size: %d sequences", BATCH_SIZE))
    print(string.format("[Config] Input: %s", embeddings_file))
    print(string.format("[Config] Output: %s", output_file))
    if SAVE_BINARY then
        print(string.format("[Config] Binary Output: %s", binary_file))
    end
    print()

    -- Check if embeddings file exists
    if not utils.file_exists(embeddings_file) then
        error(string.format("Embeddings file not found: %s\nRun: ./run.sh --generate-embeddings", embeddings_file))
    end

    -- The GPU shader reads embeddings as FP16-packed uints. We cache a
    -- binary FP16 file next to embeddings.json so subsequent runs skip
    -- the (slow) JSON parse and FP32->FP16 conversion. The binary is
    -- considered fresh if its mtime is at least as new as embeddings.json.
    local fp16_file = embeddings_file:gsub("%.json$", "_fp16.bin")

    -- Returns the modification time of path in seconds since the epoch, or nil
    -- when the file cannot be opened or stat prints nothing numeric.
    -- NOTE(review): %q applies Lua-style quoting, not shell quoting, so a path
    -- containing $ or a backtick would still be expanded by the shell.
    local function file_mtime(path)
        local f = io.open(path, "rb")
        if not f then return nil end
        f:close()
        local cmd = string.format("stat -c %%Y %q 2>/dev/null", path)
        local h = io.popen(cmd)
        if not h then return nil end
        local s = h:read("*a")
        h:close()
        return tonumber(s)
    end

    local fp16_mtime = file_mtime(fp16_file)
    local json_mtime = file_mtime(embeddings_file)
    local fp16_fresh = fp16_mtime and json_mtime and fp16_mtime >= json_mtime

    local num_poems, embedding_dim
    local embeddings_fp16 -- FFI uint16_t[?] buffer, fed to the GPU

    if fp16_fresh then
        -- We still need the metadata header (num_poems / embedding_dim).
        -- Reading just the metadata block from JSON would be cleaner but
        -- requires a streaming parser; for now we read the whole JSON
        -- just for the metadata. This is the only cost of the slow path.
        print("[Loading] Reading embeddings metadata...")
        local embeddings_data = utils.read_json_file(embeddings_file)
        if not embeddings_data or not embeddings_data.embeddings then
            error("Failed to load embeddings or invalid format")
        end
        num_poems = #embeddings_data.embeddings
        embedding_dim = embeddings_data.metadata and embeddings_data.metadata.embedding_dimension or 768
        embeddings_data = nil -- let the GC reclaim the table
        collectgarbage("collect")

        print(string.format("[Loading] Loaded %d poems × %d dimensions (FP16 cache hit)", num_poems, embedding_dim))
        print(string.format("[Loading] Reading FP16 cache: %s", fp16_file))
        local total_values = num_poems * embedding_dim
        embeddings_fp16 = ffi.new("uint16_t[?]", total_values)
        local f = io.open(fp16_file, "rb")
        if not f then error("Failed to open FP16 cache: " .. fp16_file) end
        local data = f:read("*a")
        f:close()
        -- Two bytes per FP16 value; a mismatch means the cache does not belong
        -- to this embeddings.json (or was only partially written).
        if #data ~= total_values * 2 then
            error(string.format(
                "FP16 cache size mismatch: expected %d bytes, got %d. Delete %s and retry.",
                total_values * 2, #data, fp16_file))
        end
        ffi.copy(embeddings_fp16, data, total_values * 2)
        print(string.format("[Loading] Loaded %d FP16 values (%.2f MB)",
            total_values, total_values * 2 / (1024 * 1024)))
    else
        -- Cold path: read FP32 JSON, convert to FP16, write the cache,
        -- then load the resulting FP16 buffer into memory.
        print("[Loading] Reading embeddings (FP32 JSON)...")
        local embeddings_data = utils.read_json_file(embeddings_file)
        if not embeddings_data or not embeddings_data.embeddings then
            error("Failed to load embeddings or invalid format")
        end
        num_poems = #embeddings_data.embeddings
        embedding_dim = embeddings_data.metadata and embeddings_data.metadata.embedding_dimension or 768
        print(string.format("[Loading] Loaded %d poems × %d dimensions", num_poems, embedding_dim))

        if embedding_dim % 2 ~= 0 then
            error(string.format(
                "embedding_dim must be even for FP16-packed shader; got %d. " ..
                "Pad the embeddings or use a model with an even dimension.",
                embedding_dim))
        end

        local total_values = num_poems * embedding_dim
        print(string.format("[Loading] Converting %d FP32 values to FP16...", total_values))

        -- Stage FP32 values into a flat FFI buffer (much faster than
        -- a Lua table because each element is a typed write, not a
        -- tagged-value allocation).
        local fp32_buf = ffi.new("float[?]", total_values)
        local idx = 0
        for _, poem in ipairs(embeddings_data.embeddings) do
            if not poem.embedding or #poem.embedding ~= embedding_dim then
                -- %s + tostring so a missing poem_index cannot turn this
                -- diagnostic into an unrelated string.format failure.
                error(string.format("Invalid embedding for poem_index %s", tostring(poem.poem_index)))
            end
            for _, value in ipairs(poem.embedding) do
                fp32_buf[idx] = value
                idx = idx + 1
            end
        end
        embeddings_data = nil
        collectgarbage("collect")

        -- Bulk-convert FP32 -> FP16 via the C helper. About 20M floats
        -- in a tight C loop is essentially instantaneous (~50 ms) vs.
        -- pure-Lua bit ops which would be ~minutes.
        embeddings_fp16 = ffi.new("uint16_t[?]", total_values)
        vk.fp32_to_fp16(fp32_buf, embeddings_fp16, total_values)
        fp32_buf = nil
        collectgarbage("collect")

        -- Persist the FP16 cache. Subsequent runs hit the fast path above.
        print(string.format("[Loading] Writing FP16 cache: %s", fp16_file))
        local f = io.open(fp16_file, "wb")
        if not f then error("Failed to open FP16 cache for writing: " .. fp16_file) end
        local wrote = f:write(ffi.string(embeddings_fp16, total_values * 2))
        local closed = f:close()
        if wrote and closed then
            print(string.format("[Loading] Wrote %d FP16 values (%.2f MB)",
                total_values, total_values * 2 / (1024 * 1024)))
        else
            -- The buffer in memory is still valid, so the run can continue;
            -- drop the partial file so the next run does not see it as fresh.
            os.remove(fp16_file)
            print("[Loading] WARNING: FP16 cache write failed; removed partial file and continuing with the in-memory buffer")
        end
    end
    print()

    -- Initialize GPU
    print("[GPU] Initializing Vulkan compute context...")
    local ctx = vk.init()
    if not ctx then
        error("Failed to initialize GPU context")
    end
    print()

    -- Issue 10-057: clamp BATCH_SIZE to free GPU VRAM before the batched dispatch. On a
    -- roomy card this keeps the configured size; it is the guard rail for a much larger
    -- corpus or a smaller GPU. fixed = the resident FP16 embeddings; per-sequence = the
    -- output array plus the per-sequence centroid/mask working set (approximate -- the
    -- budgeter only needs the right order of magnitude). Wrapped in pcall: this is an
    -- enhancement, so a VRAM-probe failure (e.g. no nvidia-smi) must NOT break the run --
    -- it just keeps the configured BATCH_SIZE and says so.
    do
        local ok, result = pcall(function()
            local budget = require("memory-budgeter")
            local fixed_vram = num_poems * embedding_dim * 2
            local per_seq = num_poems * 5 + embedding_dim * 4
            return budget.fit_threads({
                pool = "vram", fixed = fixed_vram, per_thread = per_seq,
                want = BATCH_SIZE, label = "diversity",
            })
        end)
        -- Only accept a usable batch size: a zero, negative, or non-numeric
        -- answer would break the batch-count arithmetic below.
        if ok and type(result) == "number" and result >= 1 then
            BATCH_SIZE = math.floor(result)
        else
            print("[GPU] VRAM budget check skipped (" .. tostring(result)
                .. "); using configured BATCH_SIZE " .. BATCH_SIZE)
        end
    end

    -- Compute diversity sequences on GPU
    local start_time = wall_clock()
    print("[GPU] Computing diversity sequences (batch parallel processing)...")
    -- One diversity sequence is produced per poem. The GPU processes them
    -- BATCH_SIZE sequences at a time, so the run is split into this many
    -- batches (the last one is partial). Reported so the reader knows how the
    -- work is chunked -- distinct from the 3 compute shaders the context holds.
    local num_batches = math.ceil(num_poems / BATCH_SIZE)
    print(string.format("[GPU] %d sequences in %d batches of up to %d each", num_poems, num_batches, BATCH_SIZE))
    print(string.format("[GPU] This will take approximately 1 minute for %d poems", num_poems))
    print()

    -- Call GPU batch function (returns Lua tables, no file writing).
    -- Embeddings are now an FFI uint16_t buffer (FP16-packed) — the
    -- compute function passes it straight through to the C side.
    -- Run under pcall so the GPU context is released even when the
    -- computation raises; the original error is then re-raised unchanged.
    local computed, sequences = pcall(
        vk.compute_all_diversity_sequences_batched,
        ctx,
        embeddings_fp16,
        num_poems,
        embedding_dim,
        SAVE_BINARY and binary_file or nil, -- Optional binary output
        BATCH_SIZE
    )

    local elapsed = wall_clock() - start_time
    if not computed then
        pcall(vk.shutdown, ctx)
        error(sequences, 0)
    end
    print()
    print(string.format("[GPU] ✅ Completed in %.2f seconds (%.2f sequences/sec)", elapsed, num_poems / elapsed))
    print()

    -- Shutdown GPU
    vk.shutdown(ctx)

    if type(sequences) ~= "table" then
        error("GPU computation returned no sequence table")
    end

    -- Format sequences as JSON (CPU responsibility)
    print("[CPU] Formatting sequences as JSON...")
    local cache = {
        metadata = {
            model = MODEL_NAME,
            num_poems = num_poems,
            embedding_dimension = embedding_dim,
            algorithm = "gpu_vulkan_batch",
            batch_size = BATCH_SIZE,
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            generation_time_seconds = math.floor(elapsed),
            embeddings_file_size = get_file_size(embeddings_file),
            total_sequences = num_poems,
            top_k = TOP_K
        },
        sequences = {}
    }

    -- Re-key the result by decimal-string poem id so it encodes as a JSON object.
    -- GPU returns: {[0] = {seq}, [1] = {seq}, ...}
    -- JSON needs: {"0": [seq], "1": [seq], ...}
    for poem_id = 0, num_poems - 1 do
        local sequence = sequences[poem_id]
        if not sequence then
            error(string.format("Missing sequence for poem_id %d", poem_id))
        end
        -- Issue 10-057: keep only the first K of the diversity walk (the top-K).
        if TOP_K > 0 and #sequence > TOP_K then
            local capped = {}
            for i = 1, TOP_K do capped[i] = sequence[i] end
            sequence = capped
        end
        cache.sequences[tostring(poem_id)] = sequence
    end

    print(string.format("[CPU] Formatted %d sequences", num_poems))
    print()

    -- Write JSON to disk. The data goes to a sibling temp file first and is
    -- renamed over the real name only after a verified write and close, so a
    -- full disk or an interrupted run can neither report success nor leave a
    -- truncated cache where a previous good one used to be.
    print(string.format("[CPU] Writing JSON to: %s", output_file))
    local json_str, encode_err = dkjson.encode(cache, {indent = false})
    if type(json_str) ~= "string" then
        error("Failed to encode diversity cache as JSON: " .. tostring(encode_err))
    end
    local tmp_file = output_file .. ".tmp"
    local f, open_err = io.open(tmp_file, "w")
    if not f then
        error("Failed to open output file: " .. output_file .. " (" .. tostring(open_err) .. ")")
    end
    local wrote, write_err = f:write(json_str)
    local closed, close_err = f:close()
    if not wrote or not closed then
        os.remove(tmp_file)
        error("Failed to write output file: " .. output_file
            .. " (" .. tostring(write_err or close_err) .. ")")
    end
    local renamed, rename_err = os.rename(tmp_file, output_file)
    if not renamed then
        os.remove(tmp_file)
        error("Failed to move output file into place: " .. output_file
            .. " (" .. tostring(rename_err) .. ")")
    end

    local output_size = get_file_size(output_file)
    print(string.format("[CPU] ✅ Written %.2f MB", output_size / 1024 / 1024))
    print()

    print("=" .. string.rep("=", 78))
    print(" SUCCESS!")
    print("=" .. string.rep("=", 78))
    print(string.format(" Total Time: %.2f seconds", elapsed))
    print(string.format(" Output: %s", output_file))
    print(string.format(" Speedup: ~2,600× faster than CPU implementation"))
    print("=" .. string.rep("=", 78))
    print()
end
-- }}}

-- Entry point: main() runs under pcall so that any raised error is printed
-- with an [ERROR] prefix and the process exits with status 1.
local ok, failure = pcall(main)
if ok == false then
    print("\n[ERROR] " .. tostring(failure))
    os.exit(1)
end
'