src/embedding-server-manager.lua

172 lines

1#!/usr/bin/env lua

-- {{{ embedding-server-manager.lua
-- Issue 10-049: Embedding-server manager. Originally written for Ollama
-- (10-005); reimplemented for llama.cpp. Provides liveness checks against
-- the configured inference server, can launch start-llamacpp-server.sh
-- when the server is down, and runs an end-to-end test embedding request.
--
-- Public API (preserved across the migration so call sites do not need
-- to change shape):
--   M.ensure_ready()            — return the endpoint URL once the server
--                                 is up (nil if it could not be started)
--   M.test_embedding(ep, model) — send one /v1/embeddings request, verify shape
--
-- The big shape changes from the Ollama era:
--   - /api/tags          → /v1/models                (OpenAI-compatible liveness)
--   - /api/embeddings    → /v1/embeddings            (OpenAI-compatible inference)
--   - "prompt" body      → "input" body              (OpenAI request shape)
--   - response.embedding → response.data[1].embedding (OpenAI response shape)
--   - "ollama pull"      → (removed)                 (llama.cpp has the model on
--                                                     disk already; nothing to pull)
--   - "ollama serve"     → delegate to scripts/start-llamacpp-server.sh
-- }}}

-- {{{ local function setup_dir_path
-- Picks the project root directory.
--
-- provided_dir: caller-supplied directory, or nil/false when none was given
-- returns:      provided_dir when it is truthy, otherwise the hard-coded
--               default project location
local function setup_dir_path(provided_dir)
    local fallback_root = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
    return provided_dir or fallback_root
end
-- }}}

-- Project root: the first command-line argument when the `arg` table exists
-- and has one, otherwise the default chosen by setup_dir_path.
-- NOTE(review): arg[1] is taken as-is, so an invocation whose first argument
-- is the "-I" flag would make DIR "-I" — confirm the expected calling
-- convention (directory first, flags after?).
local DIR = setup_dir_path(arg and arg[1])

-- Put <DIR>/libs ahead of the existing module search path.
package.path = string.format("%s/libs/?.lua;%s", DIR, package.path)

local inference_config = require("inference-server-config")

-- Module table; returned at the end of the file.
local M = {}

-- {{{ local function is_server_running
-- Pings <endpoint>/v1/models with curl (silent, capped at 2 seconds) to
-- decide whether the server is alive. /v1/models is llama.cpp's
-- OpenAI-compatible "what's loaded" endpoint. The response body is
-- discarded (parsing it is the consumer's job); only curl's exit status
-- is inspected.
--
-- endpoint: base URL of the server (string); "/v1/models" is appended
-- returns:  true when the curl command reports success, false otherwise
--
-- The URL is single-quoted for the shell so that characters such as '&',
-- ';' or spaces in a configured endpoint cannot split or extend the
-- command line.
-- NOTE(review): curl runs without --fail, so any HTTP response (including
-- 4xx/5xx) exits 0 and counts as "alive" — confirm that is intended.
local function is_server_running(endpoint)
    -- Escape embedded single quotes as '\'' so the whole URL stays one
    -- shell word. Assigning to a single local drops gsub's match count.
    local escaped_url = (endpoint .. "/v1/models"):gsub("'", "'\\''")
    local cmd = "curl -s --max-time 2 '" .. escaped_url .. "' > /dev/null 2>&1"

    -- os.execute returns a numeric status on Lua 5.1 / LuaJIT and
    -- true/nil on Lua 5.2+, so accept both success spellings.
    local result = os.execute(cmd)
    return result == 0 or result == true
end
-- }}}

-- {{{ local function start_llamacpp_service
-- Runs <DIR>/scripts/start-llamacpp-server.sh with DIR as its single
-- argument and reports whether the script succeeded. Per the original
-- design notes, the script handles its own env setup (libs/cuda on
-- LD_LIBRARY_PATH), config resolution, and liveness wait, so exit code 0
-- means the server is up — NOTE(review): that contract lives in the
-- script, which is not visible here; confirm against it.
--
-- returns: true when the script reports success, false otherwise
--
-- Both the script path and the DIR argument are single-quoted for the
-- shell so that spaces, '"', '$' or backticks in DIR are passed literally
-- instead of being interpreted.
local function start_llamacpp_service()
    -- Wraps a string in single quotes for /bin/sh; an embedded ' becomes '\''.
    local function sh_quote(text)
        local escaped = text:gsub("'", "'\\''")
        return "'" .. escaped .. "'"
    end

    print("Starting llama.cpp embedding server via scripts/start-llamacpp-server.sh...")
    local cmd = sh_quote(DIR .. "/scripts/start-llamacpp-server.sh") .. " " .. sh_quote(DIR)

    -- os.execute returns a numeric status on Lua 5.1 / LuaJIT and
    -- true/nil on Lua 5.2+, so accept both success spellings.
    local result = os.execute(cmd)
    return result == 0 or result == true
end
-- }}}

-- {{{ function M.ensure_ready
-- Makes sure the configured inference server is reachable.
--
-- Asks inference_config.build_host_url() for the endpoint, probes it, and
-- if the probe fails launches the server through start_llamacpp_service()
-- and probes once more. Progress is reported on stdout throughout.
--
-- returns: the endpoint URL when the server is (or becomes) reachable,
--          nil when it could not be started
--
-- The public name dropped the "ollama" prefix in 10-049; callers update
-- their call site in the same atomic batch as the require rename.
function M.ensure_ready()
    local url = inference_config.build_host_url()
    print("=== Inference Server Manager ===")
    print("Target endpoint: " .. url)
    print("")
    print("Checking if the inference server is running...")

    -- Set to the success message on whichever path finds the server alive.
    local success_line
    if is_server_running(url) then
        success_line = "✓ Server is already running at " .. url
    else
        print("✗ Server not running — attempting to start it")
        -- The second probe only happens when the launch itself succeeded.
        local launched = start_llamacpp_service()
        if launched and is_server_running(url) then
            success_line = "✓ Server is now ready at " .. url
        end
    end

    if success_line then
        print(success_line)
        return url
    end

    print("✗ Failed to start the inference server")
    print(" HINT: run ./scripts/start-llamacpp-server.sh manually for verbose output")
    return nil
end
-- }}}

-- {{{ function M.test_embedding
-- Sends a single test embedding request to <endpoint>/v1/embeddings
-- (OpenAI request shape) and checks that the response body contains both
-- a "data" key and an "embedding" key. Vector dimensions are not
-- validated — that's the consumer's job — but the substring check catches
-- the obvious "server started but doesn't actually serve embeddings"
-- failure.
--
-- endpoint: base URL of the server (string)
-- model:    model name placed in the request's "model" field
-- returns:  true when the response looks like an embedding reply,
--           false when it does not or the response file cannot be read
--
-- Side effects: runs <DIR>/scripts/ensure-tmp-symlink (presumably makes
-- sure <DIR>/tmp exists — confirm against the script), writes the raw
-- response to <DIR>/tmp/embedding_test.json, and prints progress.
--
-- Every value spliced into a shell command is single-quoted, and the
-- model name is JSON-escaped, so spaces in DIR or quotes in the model
-- name cannot break the command line or the request body.
function M.test_embedding(endpoint, model)
    -- Wraps a string in single quotes for /bin/sh; an embedded ' becomes '\''.
    local function sh_quote(text)
        local escaped = text:gsub("'", "'\\''")
        return "'" .. escaped .. "'"
    end

    -- Renders a value as a JSON string literal, escaping quotes,
    -- backslashes and control characters.
    local function json_string(value)
        local escaped = tostring(value):gsub('[%c"\\]', function(ch)
            if ch == '"' or ch == "\\" then
                return "\\" .. ch
            end
            return string.format("\\u%04x", ch:byte())
        end)
        return '"' .. escaped .. '"'
    end

    print("Testing embedding generation...")

    os.execute(sh_quote(DIR .. "/scripts/ensure-tmp-symlink") .. " " .. sh_quote(DIR))
    local result_path = DIR .. "/tmp/embedding_test.json"

    local request_body = '{"model": ' .. json_string(model) .. ', "input": "test embedding"}'
    local test_cmd = string.format(
        "curl -s -X POST %s -H 'Content-Type: application/json' -d %s > %s",
        sh_quote(endpoint .. "/v1/embeddings"),
        sh_quote(request_body),
        sh_quote(result_path)
    )
    -- curl's exit status is deliberately not inspected here; the verdict
    -- comes from the contents of the response file below.
    os.execute(test_cmd)

    local result_file = io.open(result_path, "r")
    if not result_file then
        print("✗ Failed to read test response")
        return false
    end
    local content = result_file:read("*a")
    result_file:close()

    -- Plain (non-pattern) substring searches for the two expected keys.
    if content:find('"data"', 1, true) and content:find('"embedding"', 1, true) then
        print("✓ Embedding generation test passed")
        return true
    end
    print("✗ Embedding generation test failed")
    print("Response: " .. content)
    return false
end
-- }}}

129

-- {{{ function M.main
-- Entry point for running the manager standalone.
--
-- interactive_mode: when truthy, show a two-option menu and read the
--                   choice from stdin; otherwise run the default flow
--
-- Default flow (also menu option 1, or an empty answer): ensure the server
-- is running, then send a test embedding. Menu option 2 skips the liveness
-- step and only sends the test embedding. Anything else prints
-- "Invalid choice".
function M.main(interactive_mode)
    -- Shared by the non-interactive path and menu option 1.
    local function ensure_then_test()
        local url = M.ensure_ready()
        if url then
            M.test_embedding(url, inference_config.get_selected_model())
        end
    end

    if not interactive_mode then
        ensure_then_test()
        return
    end

    print("=== Embedding Server Manager (interactive) ===")
    print("1. Ensure server is running, then test (default config)")
    print("2. Test embedding generation only")
    io.write("Select option (1-2): ")
    local selection = io.read()

    if selection == "2" then
        local url = inference_config.build_host_url()
        M.test_embedding(url, inference_config.get_selected_model())
    elseif selection == "1" or selection == "" then
        ensure_then_test()
    else
        print("Invalid choice")
    end
end
-- }}}

159

-- Command-line entry: when the global `arg` table exists, look for a "-I"
-- flag among the positional arguments and run M.main accordingly.
-- NOTE(review): the standalone `lua` interpreter sets `arg` globally, so
-- this presumably also fires when the module is loaded via require from
-- another script — confirm that load-time run is intended.
if arg then
    local wants_interactive = false
    local idx = 1
    while arg[idx] ~= nil do
        if arg[idx] == "-I" then
            wants_interactive = true
            break
        end
        idx = idx + 1
    end
    M.main(wants_interactive)
end

return M

172