libs/triangular-similarity-access.lua

200 lines

-- Triangular Similarity Access Utility
-- Provides transparent access to triangular individual similarity files
-- Handles symmetric lookup: similarity(A,B) = similarity(B,A)
-- Storage format: each file contains only pairs where this_id < other_id
5
6local utils = require('utils')
7local dkjson = require('dkjson')
8
-- Module table: the public functions below are attached to it and it is
-- returned at the end of the file.
local M = {}

-- In-memory cache of parsed similarity files, used by M.get_similarity_cached
-- so batch lookups do not re-read the same file.
-- Keys have the form "<embeddings_dir>:<min_id>".
local file_cache = {}

-- Maximum number of files kept in file_cache at once
local MAX_CACHE_SIZE = 100

-- Hit/miss counters reported by M.get_cache_stats and reset by M.clear_cache
local cache_hits, cache_misses = 0, 0
16
-- {{{ function M.get_similarity
-- Look up the similarity between two poems in the triangular store.
-- Each pair is stored once, in the file belonging to the smaller ID, so the
-- lookup is symmetric: get_similarity(a, b) == get_similarity(b, a).
-- Does not use this module's file cache; see M.get_similarity_cached for
-- batch lookups.
-- @param poem_a: first poem ID (number or numeric string)
-- @param poem_b: second poem ID (number or numeric string)
-- @param embeddings_dir: optional directory path (default: derived from the currently selected inference model via inference-server-config)
-- @return 1.0 when both IDs are the same; the stored similarity when the
--         pair is found; 0.0 when the file or the pair is missing
-- Raises an error when either ID cannot be converted to a number.
function M.get_similarity(poem_a, poem_b, embeddings_dir)
    -- Fast path: identical raw values
    if poem_a == poem_b then
        return 1.0
    end

    local id_a, id_b = tonumber(poem_a), tonumber(poem_b)
    if not id_a or not id_b then
        error(string.format(
            "get_similarity: poem IDs must be numeric (got %s and %s)",
            tostring(poem_a), tostring(poem_b)), 2)
    end

    -- The same ID may arrive in mixed representations ("5" vs 5); the raw
    -- comparison above misses that, so compare again after conversion.
    if id_a == id_b then
        return 1.0
    end

    -- Ensure triangular ordering (min_id < max_id)
    local min_id = math.min(id_a, id_b)
    local max_id = math.max(id_a, id_b)

    -- Default embeddings directory. The `or` expression keeps only the first
    -- result of gsub (the sanitized string) and drops the replacement count.
    embeddings_dir = embeddings_dir
        or require("inference-server-config").get_selected_model():gsub("[^%w%-_.]", "_")

    -- Build file path for the smaller ID
    -- Issue 10-054: similarity files are movable (embeddings_dir, RAM).
    local file_path = utils.similarities_dir(embeddings_dir)
        .. string.format("/poem_%d.json", min_id)

    -- Load file (with caching disabled for thread safety)
    local file_data = utils.read_json_file(file_path)
    if not file_data or not file_data.similarities then
        return 0.0
    end

    -- Search for the larger ID in the similarities array
    for _, entry in ipairs(file_data.similarities) do
        if tonumber(entry.id) == max_id then
            return entry.similarity
        end
    end

    -- Not found (shouldn't happen with complete data)
    return 0.0
end
-- }}}
56
-- {{{ function M.get_similarity_cached
-- Cached version of M.get_similarity for batch operations (NOT thread-safe).
-- Parsed similarity files are kept in the module-level file_cache, keyed by
-- "<embeddings_dir>:<min_id>", and hit/miss counters are updated on each call.
-- @param poem_a: first poem ID (number or numeric string)
-- @param poem_b: second poem ID (number or numeric string)
-- @param embeddings_dir: optional directory path (default: derived from the currently selected inference model via inference-server-config)
-- @return 1.0 when both IDs are the same; the stored similarity when the
--         pair is found; 0.0 when the file or the pair is missing
-- Raises an error when either ID cannot be converted to a number.
function M.get_similarity_cached(poem_a, poem_b, embeddings_dir)
    -- Fast path: identical raw values
    if poem_a == poem_b then
        return 1.0
    end

    local id_a, id_b = tonumber(poem_a), tonumber(poem_b)
    if not id_a or not id_b then
        error(string.format(
            "get_similarity_cached: poem IDs must be numeric (got %s and %s)",
            tostring(poem_a), tostring(poem_b)), 2)
    end

    -- The same ID may arrive in mixed representations ("5" vs 5)
    if id_a == id_b then
        return 1.0
    end

    local min_id = math.min(id_a, id_b)
    local max_id = math.max(id_a, id_b)

    -- The `or` expression keeps only the first result of gsub (the string)
    embeddings_dir = embeddings_dir
        or require("inference-server-config").get_selected_model():gsub("[^%w%-_.]", "_")

    local cache_key = string.format("%s:%d", embeddings_dir, min_id)

    -- Keep the loaded data in a local so that eviction can never take away
    -- the entry this call is about to search.
    local file_data = file_cache[cache_key]

    if file_data then
        cache_hits = cache_hits + 1
    else
        -- Cache miss - load file
        cache_misses = cache_misses + 1

        local file_path = utils.similarities_dir(embeddings_dir)
            .. string.format("/poem_%d.json", min_id)
        file_data = utils.read_json_file(file_path)

        if file_data then
            -- Make room BEFORE inserting, so the new entry cannot be the one
            -- evicted. next() yields an arbitrary entry (table traversal
            -- order is unspecified), so this is arbitrary eviction, not FIFO.
            local cache_size = 0
            for _ in pairs(file_cache) do
                cache_size = cache_size + 1
            end

            if cache_size >= MAX_CACHE_SIZE then
                local victim_key = next(file_cache)
                if victim_key ~= nil then
                    file_cache[victim_key] = nil
                end
            end

            file_cache[cache_key] = file_data
        end
    end

    if not file_data or not file_data.similarities then
        return 0.0
    end

    -- Search for the larger ID in the similarities array
    for _, entry in ipairs(file_data.similarities) do
        if tonumber(entry.id) == max_id then
            return entry.similarity
        end
    end

    return 0.0
end
-- }}}
111
-- {{{ function M.get_all_similarities_for_poem
-- Get all similarities for a specific poem (collects from triangular storage)
-- This requires reading multiple files:
--   1. This poem's file (for higher IDs)
--   2. Every lower-ID file listed in all_poem_ids, each of which is scanned
--      for an entry referencing this poem
-- @param poem_id: the poem to get similarities for (number or numeric string)
-- @param all_poem_ids: list of all valid poem IDs in corpus; entries that
--        cannot be converted to a number are skipped
-- @param embeddings_dir: optional directory path (default: derived from the currently selected inference model via inference-server-config)
-- @return array of {id, similarity} sorted by similarity (descending);
--         ties are ordered by ascending id so the result is deterministic
-- Raises an error when poem_id cannot be converted to a number.
function M.get_all_similarities_for_poem(poem_id, all_poem_ids, embeddings_dir)
    local target_id = tonumber(poem_id)
    if not target_id then
        error(string.format(
            "get_all_similarities_for_poem: poem_id must be numeric (got %s)",
            tostring(poem_id)), 2)
    end

    -- The `or` expression keeps only the first result of gsub (the string)
    embeddings_dir = embeddings_dir
        or require("inference-server-config").get_selected_model():gsub("[^%w%-_.]", "_")

    -- Resolve the directory once per call instead of once per file.
    -- NOTE(review): assumes utils.similarities_dir returns the same path for
    -- the duration of one call - confirm against utils.
    local sim_dir = utils.similarities_dir(embeddings_dir)

    local similarities = {}

    -- 1. Load this poem's file (contains similarities to higher IDs)
    local my_file = utils.read_json_file(
        sim_dir .. string.format("/poem_%d.json", target_id))
    if my_file and my_file.similarities then
        for _, entry in ipairs(my_file.similarities) do
            similarities[#similarities + 1] = {
                id = tonumber(entry.id),
                similarity = entry.similarity
            }
        end
    end

    -- 2. Check all lower-ID files for references to this poem
    for _, raw_id in ipairs(all_poem_ids) do
        local other_id = tonumber(raw_id)
        -- Skip entries that are not numeric rather than failing on the
        -- nil-vs-number comparison.
        if other_id and other_id < target_id then
            local their_file = utils.read_json_file(
                sim_dir .. string.format("/poem_%d.json", other_id))
            if their_file and their_file.similarities then
                for _, entry in ipairs(their_file.similarities) do
                    if tonumber(entry.id) == target_id then
                        similarities[#similarities + 1] = {
                            id = other_id,
                            similarity = entry.similarity
                        }
                        -- Only the first matching entry per file is used
                        break
                    end
                end
            end
        end
    end

    -- Sort by similarity (descending). table.sort is not stable, so break
    -- ties on id; entries without an id sort last among their ties.
    table.sort(similarities, function(a, b)
        if a.similarity ~= b.similarity then
            return a.similarity > b.similarity
        end
        return (a.id or math.huge) < (b.id or math.huge)
    end)

    return similarities
end
-- }}}
169
-- {{{ function M.clear_cache
-- Drop every cached similarity file and reset the hit/miss counters.
-- Useful between batch operations or to release memory.
function M.clear_cache()
    -- Replace the cache table with a fresh one and zero both counters
    file_cache, cache_hits, cache_misses = {}, 0, 0
end
-- }}}
178
-- {{{ function M.get_cache_stats
-- Report cache performance statistics.
-- @return table with {hits, misses, hit_rate, cache_size, max_cache_size};
--         hit_rate is hits / (hits + misses), or 0 when no lookups were made
function M.get_cache_stats()
    -- The cache is keyed by string, so its size has to be counted by traversal
    local entry_count = 0
    for _ in pairs(file_cache) do
        entry_count = entry_count + 1
    end

    local lookups = cache_hits + cache_misses

    -- Avoid dividing by zero before any lookup has happened
    local rate = 0
    if lookups > 0 then
        rate = cache_hits / lookups
    end

    local stats = {}
    stats.hits = cache_hits
    stats.misses = cache_misses
    stats.hit_rate = rate
    stats.cache_size = entry_count
    stats.max_cache_size = MAX_CACHE_SIZE
    return stats
end
-- }}}
198
199return M
200