src/similarity-calculator.lua

362 lines

1#!/usr/bin/env lua
2
3-- Modular Similarity Calculator
4-- Pluggable architecture for testing different similarity algorithms
5
6package.path = package.path .. ';./?.lua;./libs/?.lua'
7
8local utils = require("libs.utils")
9local json = require("libs.json")
10
-- Project root handed to the config loader. The default is the original
-- development checkout; set the NEOCITIES_PROJECT_DIR environment variable
-- to a non-empty path to run the module from a different location.
local DEFAULT_DIR = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
local env_dir = os.getenv("NEOCITIES_PROJECT_DIR")
local DIR = DEFAULT_DIR
if env_dir ~= nil and env_dir ~= "" then
    DIR = env_dir
end

-- Issue 10-003: Load unified config from config.lua
local config_loader = require("libs.config-loader")
config_loader.set_project_root(DIR)
local unified_config = config_loader.load()

-- Class table; instances created by SimilarityCalculator:new look up their
-- methods here through __index.
local SimilarityCalculator = {}
SimilarityCalculator.__index = SimilarityCalculator
20
-- {{{ function SimilarityCalculator:new
-- Create a calculator bound to a single similarity algorithm.
--   algorithm_name  (string, optional) name of the algorithm to use; when
--                   omitted the calculator falls back to "cosine"
--   config          (table, optional) stored unchanged on the instance as
--                   `config`; defaults to an empty table
-- Returns the new instance (metatable: SimilarityCalculator).
-- Raises an error listing the supported names when the requested algorithm
-- is not one of them.
function SimilarityCalculator:new(algorithm_name, config)
    local obj = {
        algorithm = algorithm_name or "cosine",
        config = config or {},
        supported_algorithms = {
            "cosine",
            "euclidean",
            "manhattan",
            "dot_product",
            "normalized_euclidean",
            "chebyshev",
            "angular",
            "pearson_correlation"
        }
    }
    setmetatable(obj, SimilarityCalculator)

    -- Validate the resolved algorithm (obj.algorithm) rather than the raw
    -- argument, so that omitting algorithm_name really selects the "cosine"
    -- default instead of failing validation.
    local valid_algorithm = false
    for _, algo in ipairs(obj.supported_algorithms) do
        if algo == obj.algorithm then
            valid_algorithm = true
            break
        end
    end

    if not valid_algorithm then
        -- tostring() keeps the message buildable for non-string arguments.
        error(string.format("Unsupported similarity algorithm: %s. Supported: %s",
            tostring(obj.algorithm), table.concat(obj.supported_algorithms, ", ")))
    end

    return obj
end
-- }}}
56
-- {{{ function SimilarityCalculator:calculate
-- Algorithm name -> name of the method that implements it.
local ALGORITHM_METHODS = {
    cosine = "cosine_similarity",
    euclidean = "euclidean_distance_to_similarity",
    manhattan = "manhattan_distance_to_similarity",
    dot_product = "dot_product_similarity",
    normalized_euclidean = "normalized_euclidean_similarity",
    chebyshev = "chebyshev_distance_to_similarity",
    angular = "angular_similarity",
    pearson_correlation = "pearson_correlation"
}

-- Score two embeddings with the algorithm selected for this instance.
--   embedding_a, embedding_b  the two vectors; both are required and must
--                             report the same length via `#`
-- Returns whatever the selected algorithm's method returns.
-- Raises an error when either vector is missing, when the lengths differ,
-- or when self.algorithm has no implementation.
function SimilarityCalculator:calculate(embedding_a, embedding_b)
    if not (embedding_a and embedding_b) then
        error("Both embeddings must be provided")
    end

    local len_a, len_b = #embedding_a, #embedding_b
    if len_a ~= len_b then
        error(string.format("Vector dimensions must match: %d vs %d", len_a, len_b))
    end

    local method_name = ALGORITHM_METHODS[self.algorithm]
    if not method_name then
        error(string.format("Algorithm implementation missing: %s", self.algorithm))
    end

    return self[method_name](self, embedding_a, embedding_b)
end
-- }}}
88
-- {{{ function SimilarityCalculator:cosine_similarity
-- Cosine of the angle between vec_a and vec_b: dot(a, b) / (|a| * |b|).
-- Iterates over #vec_a components. Returns 0 when either vector has zero
-- length (norm), since the ratio is undefined there.
function SimilarityCalculator:cosine_similarity(vec_a, vec_b)
    local dot, sq_a, sq_b = 0, 0, 0

    for idx = 1, #vec_a do
        local a, b = vec_a[idx], vec_b[idx]
        dot = dot + (a * b)
        sq_a = sq_a + (a * a)
        sq_b = sq_b + (b * b)
    end

    local len_a = math.sqrt(sq_a)
    local len_b = math.sqrt(sq_b)

    if len_a ~= 0 and len_b ~= 0 then
        return dot / (len_a * len_b)
    end

    -- At least one zero vector.
    return 0
end
-- }}}
111
-- {{{ function SimilarityCalculator:euclidean_distance_to_similarity
-- Straight-line (Euclidean) distance between vec_a and vec_b, mapped to a
-- similarity with exponential decay: similarity = e^(-distance).
-- Identical vectors therefore score 1 and the score shrinks toward 0 as
-- the vectors move apart.
function SimilarityCalculator:euclidean_distance_to_similarity(vec_a, vec_b)
    local squared_total = 0
    local i, dims = 1, #vec_a

    while i <= dims do
        local delta = vec_a[i] - vec_b[i]
        squared_total = squared_total + (delta * delta)
        i = i + 1
    end

    return math.exp(-math.sqrt(squared_total))
end
-- }}}
128
-- {{{ function SimilarityCalculator:manhattan_distance_to_similarity
-- City-block (Manhattan) distance between vec_a and vec_b -- the sum of the
-- absolute per-component differences -- mapped to a similarity with
-- exponential decay: similarity = e^(-distance).
function SimilarityCalculator:manhattan_distance_to_similarity(vec_a, vec_b)
    local abs = math.abs
    local total = 0

    for position = 1, #vec_a do
        local gap = vec_a[position] - vec_b[position]
        total = total + abs(gap)
    end

    return math.exp(-total)
end
-- }}}
141
-- {{{ function SimilarityCalculator:dot_product_similarity
-- Dot product of vec_a and vec_b, shifted and halved: (dot + 1) / 2.
-- The result lands in the 0-1 range only when the inputs are already
-- normalized (dot product within [-1, 1]); no normalization is done here.
function SimilarityCalculator:dot_product_similarity(vec_a, vec_b)
    local accumulated = 0

    for position = 1, #vec_a do
        accumulated = accumulated + (vec_a[position] * vec_b[position])
    end

    local shifted = accumulated + 1
    return shifted / 2
end
-- }}}
154
-- {{{ function SimilarityCalculator:normalized_euclidean_similarity
-- Euclidean distance between the unit-length versions of vec_a and vec_b,
-- mapped to a similarity with exponential decay: e^(-distance).
-- Returns 0 when either vector has zero length, because it cannot be
-- scaled to unit length.
function SimilarityCalculator:normalized_euclidean_similarity(vec_a, vec_b)
    local dims = #vec_a

    -- Pass 1: vector lengths.
    local sq_a, sq_b = 0, 0
    for i = 1, dims do
        sq_a = sq_a + (vec_a[i] * vec_a[i])
        sq_b = sq_b + (vec_b[i] * vec_b[i])
    end

    local len_a = math.sqrt(sq_a)
    local len_b = math.sqrt(sq_b)
    if len_a == 0 or len_b == 0 then
        return 0
    end

    -- Pass 2: squared distance between the scaled components.
    local squared_total = 0
    for i = 1, dims do
        local delta = (vec_a[i] / len_a) - (vec_b[i] / len_b)
        squared_total = squared_total + (delta * delta)
    end

    return math.exp(-math.sqrt(squared_total))
end
-- }}}
186
-- {{{ function SimilarityCalculator:chebyshev_distance_to_similarity
-- Chebyshev distance between vec_a and vec_b -- the largest absolute
-- per-component difference -- mapped to a similarity with exponential
-- decay: e^(-distance).
function SimilarityCalculator:chebyshev_distance_to_similarity(vec_a, vec_b)
    local largest = 0

    for position = 1, #vec_a do
        -- Running maximum; the current best is passed first so it is kept
        -- unless the new difference is strictly greater.
        largest = math.max(largest, math.abs(vec_a[position] - vec_b[position]))
    end

    return math.exp(-largest)
end
-- }}}
202
-- {{{ function SimilarityCalculator:angular_similarity
-- Angular similarity = 1 - (arccos(cosine_similarity) / pi).
-- Same direction scores 1, opposite direction scores 0. The cosine is
-- obtained from self:cosine_similarity and clamped to [-1, 1] before
-- arccos, so that rounding slightly outside that range cannot leave the
-- function's domain.
function SimilarityCalculator:angular_similarity(vec_a, vec_b)
    local clamped = math.max(-1, math.min(1, self:cosine_similarity(vec_a, vec_b)))
    local theta = math.acos(clamped)

    return 1 - (theta / math.pi)
end
-- }}}
215
-- {{{ function SimilarityCalculator:pearson_correlation
-- Pearson correlation coefficient between vec_a and vec_b, rescaled from
-- [-1, 1] to the 0-1 range via (r + 1) / 2.
-- Returns 0 when there are fewer than two components or when the
-- denominator is zero (e.g. a vector whose components are all equal).
function SimilarityCalculator:pearson_correlation(vec_a, vec_b)
    local count = #vec_a
    if count < 2 then
        return 0
    end

    -- Component means.
    local total_a, total_b = 0, 0
    for i = 1, count do
        total_a = total_a + vec_a[i]
        total_b = total_b + vec_b[i]
    end
    local mean_a = total_a / count
    local mean_b = total_b / count

    -- Co-deviation and squared deviations around the means.
    local covariance, spread_a, spread_b = 0, 0, 0
    for i = 1, count do
        local dev_a = vec_a[i] - mean_a
        local dev_b = vec_b[i] - mean_b
        covariance = covariance + (dev_a * dev_b)
        spread_a = spread_a + (dev_a * dev_a)
        spread_b = spread_b + (dev_b * dev_b)
    end

    local scale = math.sqrt(spread_a * spread_b)
    if scale ~= 0 then
        local r = covariance / scale
        return (r + 1) / 2
    end

    return 0
end
-- }}}
255
-- {{{ function SimilarityCalculator:get_algorithm_info
-- Summarize this calculator as a table with the fields:
--   name                  the selected algorithm (self.algorithm)
--   supported_algorithms  the instance's list of supported names
--   config                the instance's config table
--   description           text from self:get_algorithm_description()
-- The list and config are returned by reference, not copied.
function SimilarityCalculator:get_algorithm_info()
    local info = {}

    info.name = self.algorithm
    info.supported_algorithms = self.supported_algorithms
    info.config = self.config
    info.description = self:get_algorithm_description()

    return info
end
-- }}}
266
-- {{{ function SimilarityCalculator:get_algorithm_description
-- Human-readable one-line description for each algorithm name.
local ALGORITHM_DESCRIPTIONS = {
    cosine = "Cosine similarity - measures angle between vectors, standard for text embeddings",
    euclidean = "Euclidean distance converted to similarity - measures straight-line distance",
    manhattan = "Manhattan distance converted to similarity - measures city-block distance",
    dot_product = "Dot product similarity - measures vector alignment",
    normalized_euclidean = "Euclidean distance on normalized vectors",
    chebyshev = "Chebyshev distance - measures maximum dimension difference",
    angular = "Angular similarity - normalized angle between vectors",
    pearson_correlation = "Pearson correlation coefficient - measures linear correlation"
}

-- Return the description string for self.algorithm, or
-- "Unknown algorithm" when no description is registered for it.
function SimilarityCalculator:get_algorithm_description()
    local text = ALGORITHM_DESCRIPTIONS[self.algorithm]
    if text then
        return text
    end

    return "Unknown algorithm"
end
-- }}}
283
-- {{{ function SimilarityCalculator:validate_implementation
-- Run the selected algorithm against three fixed vector pairs and compare
-- each score with a hard-coded expectation, within a per-case tolerance.
-- Returns a table:
--   algorithm           self.algorithm
--   validation_results  one entry per case with test_name, expected,
--                       calculated, difference, tolerance and passed
--   all_tests_passed    true only when every case passed
-- Scores are obtained through self:calculate.
function SimilarityCalculator:validate_implementation()
    -- Small constructor so the case list below reads as a table of data.
    local function make_case(name, vec_a, vec_b, expected, tolerance)
        return {
            name = name,
            vec_a = vec_a,
            vec_b = vec_b,
            expected_similarity = expected,
            tolerance = tolerance
        }
    end

    local cases = {
        make_case("identical_vectors", {1, 0, 0}, {1, 0, 0}, 1.0, 0.001),
        -- Wide tolerance: distance-based algorithms may not give exactly 0.
        make_case("orthogonal_vectors", {1, 0, 0}, {0, 1, 0}, 0.0, 0.6),
        -- Expectation is the cosine value (-1); the tolerance allows for
        -- algorithm differences.
        make_case("opposite_vectors", {1, 0, 0}, {-1, 0, 0}, -1.0, 1.2)
    }

    local report = {
        algorithm = self.algorithm,
        validation_results = {},
        all_tests_passed = true
    }

    for index = 1, #cases do
        local case = cases[index]
        local score = self:calculate(case.vec_a, case.vec_b)
        local gap = math.abs(score - case.expected_similarity)
        local within_tolerance = gap <= case.tolerance

        report.all_tests_passed = report.all_tests_passed and within_tolerance

        report.validation_results[#report.validation_results + 1] = {
            test_name = case.name,
            expected = case.expected_similarity,
            calculated = score,
            difference = gap,
            tolerance = case.tolerance,
            passed = within_tolerance
        }
    end

    return report
end
-- }}}
339
-- {{{ function create_from_config
-- Issue 10-003: Use unified config instead of similarity-calculator-settings.json
-- Build a calculator, choosing the algorithm in this order of preference:
--   1. the algorithm_name argument, when given
--   2. unified_config.similarity.default_algorithm, when present
--   3. "cosine"
-- Returns the instance produced by SimilarityCalculator:new.
local function create_from_config(algorithm_name)
    local similarity_settings = unified_config.similarity or {}

    local chosen = algorithm_name
    if not chosen then
        chosen = similarity_settings.default_algorithm
    end
    if not chosen then
        chosen = "cosine"
    end

    return SimilarityCalculator:new(chosen)
end
-- }}}
350
-- {{{ function get_available_algorithms
-- Return the list of supported algorithm names. The list is read from a
-- throwaway "cosine" calculator, since it lives on instances.
local function get_available_algorithms()
    return SimilarityCalculator:new("cosine").supported_algorithms
end
-- }}}
357
-- Public module interface: the class table plus the two module-level
-- helper functions.
local exports = {}
exports.SimilarityCalculator = SimilarityCalculator
exports.create_from_config = create_from_config
exports.get_available_algorithms = get_available_algorithms
return exports