libs/triangular-similarity-access.lua

200 lines

-- Triangular Similarity Access Utility
-- Provides transparent access to triangular individual similarity files
-- Handles symmetric lookup: similarity(A,B) = similarity(B,A)
-- Storage format: each file contains only pairs where this_id < other_id

6local utils = require('utils')

7local dkjson = require('dkjson')

-- Module table; returned at the end of the file
local M = {}

-- File cache to avoid repeated I/O during batch operations.
-- MAX_CACHE_SIZE bounds the number of cached files; the two counters feed
-- M.get_cache_stats.
local MAX_CACHE_SIZE = 100
local file_cache = {}
local cache_hits, cache_misses = 0, 0

-- {{{ function M.get_similarity
-- Look up the similarity between two poems in triangular storage.
-- Each pair is stored once, in the file named after the smaller ID, so the
-- lookup is symmetric: get_similarity(a, b) == get_similarity(b, a).
-- The file is read on every call; this variant never touches file_cache.
-- @param poem_a: first poem ID (number or numeric string)
-- @param poem_b: second poem ID (number or numeric string)
-- @param embeddings_dir: optional directory path (default: derived from the currently selected inference model via inference-server-config)
-- @return similarity score (0.0 to 1.0); 1.0 for a poem compared with itself,
--         0.0 when the file or the pair is missing
-- Raises an error when either ID is not numeric.
function M.get_similarity(poem_a, poem_b, embeddings_dir)
    -- Handle self-similarity for identical raw values
    if poem_a == poem_b then
        return 1.0
    end

    local id_a, id_b = tonumber(poem_a), tonumber(poem_b)
    if not id_a or not id_b then
        error(string.format(
            "get_similarity: poem IDs must be numeric (got %s and %s)",
            tostring(poem_a), tostring(poem_b)), 2)
    end

    -- "5" and 5 are different Lua values but the same poem; without this
    -- check the lookup would search poem_5.json for ID 5 and return 0.0.
    if id_a == id_b then
        return 1.0
    end

    -- Ensure triangular ordering (min_id < max_id)
    local min_id = math.min(id_a, id_b)
    local max_id = math.max(id_a, id_b)

    -- Default embeddings directory: sanitized name of the selected model
    embeddings_dir = embeddings_dir or require("inference-server-config").get_selected_model():gsub("[^%w%-_.]", "_")

    -- Build file path for the smaller ID
    -- Issue 10-054: similarity files are movable (embeddings_dir, RAM).
    local file_path = utils.similarities_dir(embeddings_dir) .. string.format("/poem_%d.json", min_id)

    local file_data = utils.read_json_file(file_path)
    if not file_data or not file_data.similarities then
        return 0.0
    end

    -- Search for the larger ID in the similarities array
    for _, entry in ipairs(file_data.similarities) do
        if tonumber(entry.id) == max_id then
            return entry.similarity
        end
    end

    -- Not found (shouldn't happen with complete data)
    return 0.0
end
-- }}}

-- {{{ function M.get_similarity_cached
-- Cached version of M.get_similarity for batch operations (NOT thread-safe).
-- Parsed files are kept in file_cache under the key "<embeddings_dir>:<min_id>";
-- cache_hits / cache_misses are updated on every lookup that reaches the cache.
-- @param poem_a: first poem ID (number or numeric string)
-- @param poem_b: second poem ID (number or numeric string)
-- @param embeddings_dir: optional directory path (default: derived from the currently selected inference model via inference-server-config)
-- @return similarity score (0.0 to 1.0); 1.0 for a poem compared with itself,
--         0.0 when the file or the pair is missing
-- Raises an error when either ID is not numeric.
function M.get_similarity_cached(poem_a, poem_b, embeddings_dir)
    if poem_a == poem_b then
        return 1.0
    end

    local id_a, id_b = tonumber(poem_a), tonumber(poem_b)
    if not id_a or not id_b then
        error(string.format(
            "get_similarity_cached: poem IDs must be numeric (got %s and %s)",
            tostring(poem_a), tostring(poem_b)), 2)
    end

    -- "5" and 5 are different Lua values but the same poem
    if id_a == id_b then
        return 1.0
    end

    local min_id = math.min(id_a, id_b)
    local max_id = math.max(id_a, id_b)

    embeddings_dir = embeddings_dir or require("inference-server-config").get_selected_model():gsub("[^%w%-_.]", "_")

    -- Check cache first
    local cache_key = string.format("%s:%d", embeddings_dir, min_id)
    local file_data = file_cache[cache_key]

    if file_data then
        cache_hits = cache_hits + 1
    else
        -- Cache miss - load file
        cache_misses = cache_misses + 1

        local file_path = utils.similarities_dir(embeddings_dir) .. string.format("/poem_%d.json", min_id)
        file_data = utils.read_json_file(file_path)

        if file_data then
            -- Make room BEFORE inserting so the entry we just loaded can never
            -- be the one evicted. The victim is whatever next() yields first;
            -- table traversal order is unspecified, so this is an arbitrary
            -- eviction rather than FIFO.
            local cache_size = 0
            for _ in pairs(file_cache) do cache_size = cache_size + 1 end

            if cache_size >= MAX_CACHE_SIZE then
                local victim_key = next(file_cache)
                if victim_key ~= nil then
                    file_cache[victim_key] = nil
                end
            end

            file_cache[cache_key] = file_data
        end
    end

    -- file_data is held in a local, so the result does not depend on what
    -- the eviction step removed.
    if not file_data or not file_data.similarities then
        return 0.0
    end

    -- Search for similarity
    for _, entry in ipairs(file_data.similarities) do
        if tonumber(entry.id) == max_id then
            return entry.similarity
        end
    end

    return 0.0
end
-- }}}

111

-- {{{ function M.get_all_similarities_for_poem
-- Get all similarities for a specific poem (collects from triangular storage)
-- This requires reading multiple files:
-- 1. This poem's file (for higher IDs)
-- 2. All lower-ID files that may reference this poem
-- @param poem_id: the poem to get similarities for (number or numeric string)
-- @param all_poem_ids: list of all valid poem IDs in corpus; entries that are
--        not numeric are skipped
-- @param embeddings_dir: optional directory path (default: derived from the currently selected inference model via inference-server-config)
-- @return array of {id, similarity} sorted by similarity (descending);
--         equal scores are ordered by ascending id
-- Raises an error when poem_id is not numeric.
function M.get_all_similarities_for_poem(poem_id, all_poem_ids, embeddings_dir)
    local target_id = tonumber(poem_id)
    if not target_id then
        error(string.format(
            "get_all_similarities_for_poem: poem_id must be numeric (got %s)",
            tostring(poem_id)), 2)
    end

    embeddings_dir = embeddings_dir or require("inference-server-config").get_selected_model():gsub("[^%w%-_.]", "_")

    -- Resolve the directory once; it does not change between iterations
    local sim_dir = utils.similarities_dir(embeddings_dir)

    local similarities = {}

    -- 1. Load this poem's file (contains similarities to higher IDs)
    local my_file = utils.read_json_file(sim_dir .. string.format("/poem_%d.json", target_id))
    if my_file and my_file.similarities then
        for _, entry in ipairs(my_file.similarities) do
            similarities[#similarities + 1] = {
                id = tonumber(entry.id),
                similarity = entry.similarity
            }
        end
    end

    -- 2. Check all lower-ID files for references to this poem
    for _, raw_id in ipairs(all_poem_ids) do
        local other_id = tonumber(raw_id)
        -- A non-numeric entry yields nil; skip it instead of comparing nil
        if other_id and other_id < target_id then
            local their_file = utils.read_json_file(sim_dir .. string.format("/poem_%d.json", other_id))
            if their_file and their_file.similarities then
                for _, entry in ipairs(their_file.similarities) do
                    if tonumber(entry.id) == target_id then
                        similarities[#similarities + 1] = {
                            id = other_id,
                            similarity = entry.similarity
                        }
                        break
                    end
                end
            end
        end
    end

    -- Sort by similarity (descending). table.sort is not stable, so ties are
    -- broken by id to keep the output order deterministic; entries whose id
    -- could not be parsed sort last within a tie.
    table.sort(similarities, function(a, b)
        if a.similarity ~= b.similarity then
            return a.similarity > b.similarity
        end
        return (a.id or math.huge) < (b.id or math.huge)
    end)

    return similarities
end
-- }}}

169

-- {{{ function M.clear_cache
-- Drop every cached file and reset the hit/miss counters to zero
-- (useful between operations or for memory management)
function M.clear_cache()
    file_cache, cache_hits, cache_misses = {}, 0, 0
end
-- }}}

178

-- {{{ function M.get_cache_stats
-- Report how the file cache has performed since the last M.clear_cache
-- @return table with {hits, misses, hit_rate, cache_size, max_cache_size};
--         hit_rate is 0 when no lookups have been counted yet
function M.get_cache_stats()
    -- Count the cached files by walking the table key by key
    local entry_count = 0
    local key = next(file_cache)
    while key ~= nil do
        entry_count = entry_count + 1
        key = next(file_cache, key)
    end

    local lookups = cache_hits + cache_misses
    local ratio = 0
    if lookups > 0 then
        ratio = cache_hits / lookups
    end

    local stats = {}
    stats.hits = cache_hits
    stats.misses = cache_misses
    stats.hit_rate = ratio
    stats.cache_size = entry_count
    stats.max_cache_size = MAX_CACHE_SIZE
    return stats
end
-- }}}

198

199return M

200