libs/embed-chunking-test.lua

217 lines

1-- {{{ embed-chunking-test.lua
2-- Issue 10-050: offline tests for fuzzy-computing's chunk+batch+recombine core
3-- (M._embed_with_chunking_impl). Uses a MOCK embedder AND a mock token counter,
4-- so no inference server is needed — this exercises the index bookkeeping that a
5-- live test cannot isolate.
6-- luajit libs/embed-chunking-test.lua
7--
8-- The mock counter `count_chars` returns one token per character, which makes
9-- the token-based chunker behave identically to a char limit of the same number
10-- — so a max_tokens of 7200 here splits exactly where a 7200-char limit would.
11-- }}}
12
13package.path = "libs/?.lua;./?.lua;" .. package.path
14local fuzzy = require("fuzzy-computing")
15local chunk = require("text-chunking")
16
-- Script-wide tallies: number of failed checks / number of checks executed.
local failures, tests = 0, 0
-- {{{ local function check(name, cond, detail)
-- Record the outcome of one check and print a result line for it.
--   name   : label printed after the ok / FAIL tag
--   cond   : any value; truthy is a pass, nil/false is a failure
--   detail : optional text, appended after " -> " on failure only
local function check(name, cond, detail)
  tests = tests + 1
  if not cond then
    failures = failures + 1
    local suffix = ""
    if detail then suffix = " -> " .. detail end
    print(" FAIL " .. name .. suffix)
    return
  end
  print(" ok " .. name)
end
-- }}}
29
-- count_chars: deterministic mock "tokenizer" -- 1 token per character, i.e.
-- the token count of a string is simply its length (#s).
local function count_chars(s)
  return #s
end
32
-- {{{ mock embedder: returns a deterministic 2-D vector per input
-- Every input text maps to the vector { length_of_text, first_byte } (first_byte
-- is 0 when the text has no bytes), so a test can predict exactly what a
-- recombined vector should be. The returned state table records how many
-- sub-batches the embedder was called with (calls) and the size of the largest
-- one (max_sub), to verify BATCH_SIZE chunking of requests.
-- Returns: embed_fn, state
local function make_mock()
  local state = { calls = 0, max_sub = 0 }

  local function embed(sub)
    local count = #sub
    state.calls = state.calls + 1
    state.max_sub = math.max(state.max_sub, count)

    local vectors = {}
    for idx = 1, count do
      local text = sub[idx]
      local first = string.byte(text)
      if not first then first = 0 end
      vectors[idx] = { #text, first }
    end
    return vectors, nil
  end

  return embed, state
end
-- }}}
51
-- {{{ short texts: one chunk each, vectors come back in input order
-- Both inputs are far below max_tokens (7200 under the 1-token-per-char mock),
-- so each is expected to come back as the mock's own { length, first_byte }.
local fn = make_mock()
local out = fuzzy._embed_with_chunking_impl({ "aaa", "bbbb" }, fn, count_chars, 7200, "length_weighted_mean")
check("two short texts -> two vectors", out and #out == 2)
-- Fetch the per-text vectors defensively: if the call returned nil (or left a
-- slot empty) the checks below must record a FAIL rather than raise an
-- "attempt to index a nil value" error that would abort the whole run.
local vec1 = out and out[1]
local vec2 = out and out[2]
check("text 1 vector reflects its length/first-byte",
  type(vec1) == "table" and vec1[1] == 3 and vec1[2] == string.byte("a"))
check("text 2 vector reflects its length/first-byte",
  type(vec2) == "table" and vec2[1] == 4 and vec2[2] == string.byte("b"))
-- }}}
61
-- {{{ a long text is chunked, embedded, and recombined into ONE vector
local fn2 = make_mock()
local long = string.rep("word ", 4000) -- 20000 chars -> multiple chunks at 7200
local chunks = chunk.chunk_text_by_tokens(long, count_chars, 7200)
check("sanity: long text splits into >1 chunk", #chunks > 1, "chunks=" .. #chunks)
local out2 = fuzzy._embed_with_chunking_impl({ long }, fn2, count_chars, 7200, "length_weighted_mean")
-- `combined` is the single recombined vector, or nil/false when the call did
-- not return exactly one result; later checks go through it so a bad result
-- is reported as a FAIL instead of raising on an index of nil.
local combined = out2 and #out2 == 1 and out2[1]
check("long text -> exactly one combined vector", type(combined) == "table")
-- The combined dim-1 value is the length-weighted mean of each chunk's length,
-- weighted by that same length: sum(len^2)/sum(len). Compute the expectation.
local num, den = 0, 0
for _, c in ipairs(chunks) do num = num + #c * #c; den = den + #c end
local want = num / den
local got = type(combined) == "table" and combined[1] or nil
-- Build the failure detail without assuming `got` is a number (%.4f would
-- raise on nil).
local detail
if type(got) == "number" then
  detail = string.format("got %.4f want %.4f", got, want)
else
  detail = string.format("got %s want %.4f", tostring(got), want)
end
check("recombined value matches length-weighted-mean formula",
  type(got) == "number" and math.abs(got - want) < 1e-6,
  detail)
-- }}}
77
-- {{{ mixed batch: short + long together, flat order preserved across the seam
-- The long text sits between two short ones; the short texts' vectors must
-- still carry their own lengths (5 and 4) at positions 1 and 3.
local fn3 = make_mock()
local out3 = fuzzy._embed_with_chunking_impl({ "short", long, "tiny" }, fn3, count_chars, 7200, "mean")
check("mixed batch -> 3 vectors", out3 and #out3 == 3)
-- Guarded lookups: a nil result or an empty slot becomes a FAIL, not a raised
-- index error that would stop the remaining checks from running.
local before_long = out3 and out3[1]
local after_long = out3 and out3[3]
check("short text before the long one is intact",
  type(before_long) == "table" and before_long[1] == 5)
check("short text after the long one is intact",
  type(after_long) == "table" and after_long[1] == 4)
-- }}}
85
-- {{{ BATCH_SIZE bounds the per-request size
-- 40 single-chunk texts with BATCH_SIZE 16 -> ceil(40/16)=3 calls, max sub 16.
-- The module-level BATCH_SIZE is overridden only for this one call and then
-- put back, so the override does not leak into the sections that follow.
local saved_batch_size = fuzzy.BATCH_SIZE
fuzzy.BATCH_SIZE = 16
local fn4, st4 = make_mock()
local many = {}
for i = 1, 40 do many[i] = "t" .. i end
fuzzy._embed_with_chunking_impl(many, fn4, count_chars, 7200, "mean")
fuzzy.BATCH_SIZE = saved_batch_size
check("40 inputs split into 3 sub-batches", st4.calls == 3, "calls=" .. st4.calls)
check("no sub-batch exceeds BATCH_SIZE", st4.max_sub <= 16, "max=" .. st4.max_sub)
-- }}}
96
-- {{{ total failure (server down): EVERY request fails -> (nil, all_requests_failed)
-- The embedder below rejects every sub-batch it is handed; the call is then
-- expected to return nil plus the error string "all_requests_failed".
local function failing(_)
  return nil, "no_response"
end
local out5, err5 = fuzzy._embed_with_chunking_impl({ "a", "b" }, failing, count_chars, 7200, "mean")
local total_failure_reported = (out5 == nil) and (err5 == "all_requests_failed")
check("total failure -> nil + all_requests_failed",
  total_failure_reported, "err=" .. tostring(err5))
-- }}}
103
-- {{{ a failed REQUEST isolates to its own items; sibling requests still embed
-- Each input is one 12000-char chunk (12000 tokens as counted by count_chars,
-- under the 99999 max_tokens). The test assumes every such input lands in its
-- own token-budgeted request -- NOTE(review): an earlier note here quoted
-- "~3000 est tokens" per input; confirm which figure the packer actually uses.
-- The mock rejects only a request containing "BAD". Isolation is at request
-- granularity: only that item goes nil, siblings embed.
local function req_isolate(sub)
  local vectors = {}
  for i = 1, #sub do
    local text = sub[i]
    -- one poisoned item fails the whole request
    if text:find("BAD") then return nil, "rejected" end
    vectors[i] = { #text }
  end
  return vectors, nil
end
local g = string.rep("x", 12000) -- a single chunk at max_tokens=99999
local outR = fuzzy._embed_with_chunking_impl({ g, g .. "BAD", g }, req_isolate, count_chars, 99999, "mean")
local isolated = outR and outR[1] ~= nil and outR[2] == nil and outR[3] ~= nil
check("failed request -> its item nil, sibling requests still embed", isolated)
-- }}}
119
-- {{{ token-budget packing: a few big chunks split into multiple requests
-- Packing is described as using the EXACT per-chunk token counts (count_chars
-- -> 12000 each here). NOTE(review): this section does not set the budget
-- itself; it relies on fuzzy's REQUEST_TOKEN_BUDGET (noted as 4000) being small
-- enough that three 12000-token inputs cannot share one request -> >1 request.
local pack_calls = 0
-- Embedder that only counts how many requests it receives.
local function count_calls(sub)
  pack_calls = pack_calls + 1
  local vectors = {}
  for i = 1, #sub do vectors[i] = { 1 } end
  return vectors, nil
end
local big = string.rep("x", 12000) -- one chunk at max_tokens=99999
local heavy_inputs = { big, big, big }
fuzzy._embed_with_chunking_impl(heavy_inputs, count_calls, count_chars, 99999, "mean")
check("token budget splits heavy inputs across requests", pack_calls >= 2,
  "requests=" .. pack_calls)
-- }}}
136
-- {{{ a single missing chunk vector -> that text is nil, others survive
-- The embedder answers every request successfully but leaves the slot of any
-- text containing "DROP" empty, simulating a response with a missing vector.
local picky = function(sub)
  local o = {}
  for i = 1, #sub do
    if not sub[i]:find("DROP") then o[i] = { 1, 1 } end
  end
  return o, nil
end
local out6 = fuzzy._embed_with_chunking_impl({ "ok one", "DROP me", "ok two" }, picky, count_chars, 7200, "mean")
-- Require a table result before indexing it: a nil return must show up as a
-- FAIL here instead of raising and aborting the remaining checks.
local have_results = type(out6) == "table"
check("missing-vector text -> nil", have_results and out6[2] == nil)
check("neighbours of a missing-vector text still embed",
  have_results and type(out6[1]) == "table" and type(out6[3]) == "table")
-- }}}
150
-- {{{ sanitize_utf8: drops bad bytes, keeps valid text, always yields valid UTF-8
-- A STRICT validator (the kind nlohmann/the server enforces): rejects overlong
-- encodings, UTF-16 surrogates, and code points > U+10FFFF, via per-lead-byte
-- first-continuation ranges. A merely-structural check would give false
-- confidence (it did, earlier — that is the bug this guards against).
--
-- Multi-byte lead bytes, one row per range:
--   { first lead, last lead, sequence length, min 2nd byte, max 2nd byte }
-- Leads 0xC0/0xC1 and 0xF5..0xFF have no row and are therefore rejected, as is
-- a bare continuation byte (0x80..0xBF) in lead position.
local UTF8_LEAD_RANGES = {
  { 0xC2, 0xDF, 2, 0x80, 0xBF },
  { 0xE0, 0xE0, 3, 0xA0, 0xBF }, -- 2nd byte >= 0xA0 excludes overlong 3-byte forms
  { 0xE1, 0xEC, 3, 0x80, 0xBF },
  { 0xED, 0xED, 3, 0x80, 0x9F }, -- 2nd byte <= 0x9F excludes U+D800..U+DFFF
  { 0xEE, 0xEF, 3, 0x80, 0xBF },
  { 0xF0, 0xF0, 4, 0x90, 0xBF }, -- 2nd byte >= 0x90 excludes overlong 4-byte forms
  { 0xF1, 0xF3, 4, 0x80, 0xBF },
  { 0xF4, 0xF4, 4, 0x80, 0x8F }, -- 2nd byte <= 0x8F caps at U+10FFFF
}

-- Returns true when every byte of s belongs to a well-formed UTF-8 sequence
-- (the empty string is valid), false at the first violation or truncation.
local function is_valid_utf8(s)
  local total = #s
  local pos = 1
  while pos <= total do
    local lead = s:byte(pos)
    if lead < 0x80 then
      -- plain ASCII: a one-byte sequence
      pos = pos + 1
    else
      local row
      for _, candidate in ipairs(UTF8_LEAD_RANGES) do
        if lead >= candidate[1] and lead <= candidate[2] then
          row = candidate
          break
        end
      end
      if not row then return false end

      local last = pos + row[3] - 1
      if last > total then return false end -- sequence runs past end of string

      -- the second byte has a lead-specific range ...
      local second = s:byte(pos + 1)
      if second < row[4] or second > row[5] then return false end
      -- ... every later byte is an ordinary continuation byte
      for at = pos + 2, last do
        local cont = s:byte(at)
        if cont < 0x80 or cont > 0xBF then return false end
      end
      pos = last + 1
    end
  end
  return true
end
184
-- Each case calls fuzzy.sanitize_utf8 and inspects its two results: the cleaned
-- string and a count (compared against 0 / 1 / "> 0" below).
local ascii_in = "plain ascii poem"
local ascii_out, ascii_removed = fuzzy.sanitize_utf8(ascii_in)
check("sanitize: clean ASCII unchanged", ascii_out == ascii_in and ascii_removed == 0)

local mu = "micro \xC2\xB5 sign" -- µ properly encoded as 0xC2 0xB5
local mu_out, mu_removed = fuzzy.sanitize_utf8(mu)
check("sanitize: valid 2-byte UTF-8 preserved", mu_out == mu and mu_removed == 0)

-- a lone 0xB5 is a continuation byte with no lead = invalid
local lone_out, lone_removed = fuzzy.sanitize_utf8("bad \xB5 byte")
check("sanitize: lone 0xB5 stripped", lone_out == "bad byte" and lone_removed == 1)

-- the actual poison: a PDF header like poem 8169's first bytes
local pdf = "%PDF-1.5\n%\xB5\xED\xAE\xFB\n4 0 obj"
local pdf_out, pdf_removed = fuzzy.sanitize_utf8(pdf)
check("sanitize: PDF binary header -> valid UTF-8, bytes removed",
  is_valid_utf8(pdf_out) and pdf_removed > 0, "removed=" .. pdf_removed)

-- strict cases a lenient check would WRONGLY accept:
-- E0 9A = overlong (the real 8169 byte)
local overlong_out, overlong_removed = fuzzy.sanitize_utf8("x\xE0\x9A\xB1y")
check("sanitize: overlong 3-byte stripped",
  is_valid_utf8(overlong_out) and overlong_removed > 0, "got " .. overlong_out)
-- U+D800 surrogate
local surrogate_out, surrogate_removed = fuzzy.sanitize_utf8("x\xED\xA0\x80y")
check("sanitize: UTF-16 surrogate stripped",
  is_valid_utf8(surrogate_out) and surrogate_removed > 0)
-- > U+10FFFF
local oob_out, oob_removed = fuzzy.sanitize_utf8("x\xF4\x90\x80\x80y")
check("sanitize: out-of-range 4-byte stripped",
  is_valid_utf8(oob_out) and oob_removed > 0)
-- valid 3- and 4-byte survive untouched
local valid3 = "snowman \xE2\x98\x83 emoji \xF0\x9F\x98\x80" -- ☃ and 😀
-- parentheses keep only the first return value (the cleaned string)
check("sanitize: valid 3- and 4-byte preserved", (fuzzy.sanitize_utf8(valid3)) == valid3)
-- }}}
212
-- Final tally, then exit status: 0 when every check passed, 1 otherwise.
local passed = tests - failures
print(string.format("\n%d/%d checks passed", passed, tests))
if failures == 0 then
  os.exit(0)
end
os.exit(1)
215
216-- vim: set foldmethod=marker:
217