-- libs/text-chunking-test.lua
-- {{{ text-chunking-test.lua
-- Issue 10-050: unit tests for libs/text-chunking.lua.
--
-- Pure-function tests — no inference server, no files. Run directly:
--     luajit libs/text-chunking-test.lua
-- Exits non-zero on failure so it can gate a build/deploy step.
--
-- Sizing is token-exact via an injected counter. `count_chars` (1 token per
-- character) is a perfectly additive mock, so a max_tokens of N behaves like an
-- N-character limit AND the returned per-chunk counts equal the chunk lengths.
-- }}}

-- Search libs/ first when resolving `require`. The entry is relative, so it is
-- looked up against the current working directory.
package.path = "libs/?.lua;" .. package.path

-- Module under test.
local chunk = require("text-chunking")

-- Running tallies: check() increments them, the summary at the end of the
-- script reads them.
local failures = 0
local tests = 0

-- {{{ local function check(name, condition, detail)
--- Record one check and print its outcome.
-- Increments the `tests` tally on every call, and the `failures` tally when
-- `condition` is falsy (nil or false).
-- @param name      label printed after the ok/FAIL marker
-- @param condition any value; truthy means the check passed
-- @param detail    optional context, printed as " -> <detail>" on failure only.
--                  It is passed through tostring() so that a non-string value
--                  (table, boolean, ...) cannot make the reporter itself raise.
local function check(name, condition, detail)
    tests = tests + 1
    if condition then
        print(string.format(" ok %s", name))
        return
    end

    failures = failures + 1
    local suffix = ""
    if detail then
        suffix = " -> " .. tostring(detail)
    end
    print(string.format(" FAIL %s%s", name, suffix))
end
-- }}}

--- Mock token counter: reports one token per byte of `s` (`#s`).
-- Byte length is additive under concatenation, which is what lets the tests
-- below compare returned token counts directly against chunk lengths.
local function count_chars(s)
    return #s
end

-- {{{ short text is returned whole
-- 12 characters against a 7200-token budget: expect exactly one chunk, and
-- expect it to be the input string unchanged.
local single = chunk.chunk_text_by_tokens("a short poem", count_chars, 7200)
check("short text -> single chunk",
    #single == 1 and single[1] == "a short poem",
    "chunks=" .. #single)
-- }}}

-- {{{ empty / whitespace -> no chunks
-- Both an empty string and a whitespace-only string are expected to yield an
-- empty chunk list. The helper locals are scoped so they do not leak into the
-- rest of the script.
do
    local from_empty = chunk.chunk_text_by_tokens("", count_chars, 7200)
    check("empty string -> {}", #from_empty == 0, "chunks=" .. #from_empty)

    local from_blank = chunk.chunk_text_by_tokens(" \n\t ", count_chars, 7200)
    check("whitespace -> {}", #from_blank == 0, "chunks=" .. #from_blank)
end
-- }}}

-- {{{ max_tokens is required (no guessed default)
-- Calling with a nil budget is expected to raise. pcall converts the raise
-- into a `false` status so the script keeps running and can report it.
local ok_req = pcall(function()
    return chunk.chunk_text_by_tokens("hi", count_chars, nil)
end)
check("max_tokens is required (errors when nil)", ok_req == false,
    "call returned without raising")
-- }}}

-- {{{ splitting is LOSSLESS and counts are EXACT
-- The most important invariant: chunking must never drop or alter a character.
-- Build 60 two-sentence paragraphs separated by blank lines, then split with a
-- 500-token budget (500 characters under count_chars).
local parts = {}
for i = 1, 60 do
    parts[#parts + 1] = "Paragraph number " .. i .. " has several words in it. "
        .. "It also has a second sentence to give the splitter something to chew on."
end
local long_text = table.concat(parts, "\n\n")
local chunks, counts = chunk.chunk_text_by_tokens(long_text, count_chars, 500)
check("long text actually split into many chunks", #chunks > 1, "got " .. #chunks)
check("lossless: concat(chunks) == original", table.concat(chunks) == long_text)

-- With an additive counter, every returned count must equal the chunk's length.
-- Check the type first: a missing second return value should show up as a
-- failed check rather than abort the script on `#counts`.
local counts_exact = type(counts) == "table" and #counts == #chunks
local counts_detail = nil
if not counts_exact then
    counts_detail = "counts is " .. type(counts) .. " or its length differs from #chunks"
else
    for i = 1, #chunks do
        if counts[i] ~= #chunks[i] then
            counts_exact = false
            counts_detail = "first mismatch at chunk " .. i
            break
        end
    end
end
check("returned per-chunk counts are exact (additive counter)", counts_exact, counts_detail)
-- }}}

-- {{{ every chunk respects the max size
-- Re-uses `chunks` from the 500-token split of `long_text`; under count_chars
-- the token limit is a character limit, so `#c` is compared directly.
local all_within = true
local oversize_detail = nil
for i, c in ipairs(chunks) do
    if #c > 500 then
        all_within = false
        -- Remember only the first offender for the failure message.
        oversize_detail = oversize_detail
            or string.format("chunk %d has %d chars", i, #c)
    end
end
check("every chunk <= max_tokens", all_within, oversize_detail)
-- }}}

-- {{{ degenerate input: one giant unbreakable token -> hard split, still lossless
local giant = string.rep("x", 1730) -- no separators at all
local giant_chunks = chunk.chunk_text_by_tokens(giant, count_chars, 500)
-- 1730 characters at 500 per chunk -> 4 pieces (500 + 500 + 500 + 230).
local giant_ok = (#giant_chunks == 4) and (table.concat(giant_chunks) == giant)
-- Four pieces that concatenate back to the input could still be unevenly
-- sized, so also enforce the per-chunk limit, as the no-separator blob test
-- further down does.
for _, piece in ipairs(giant_chunks) do
    if #piece > 500 then giant_ok = false end
end
check("giant unbreakable token hard-splits losslessly", giant_ok,
    "chunks=" .. #giant_chunks)
-- }}}

-- {{{ counter drives splitting with a NON-char token ratio
-- tok4 reports 1 token per 4 chars (ceil). Verify splitting respects the TOKEN
-- limit, not the char length, and stays lossless.
local function tok4(s)
    return math.ceil(#s / 4)
end

local parts2 = {}
for i = 1, 50 do
    parts2[#parts2 + 1] = "Sentence number " .. i .. " has a handful of words."
end
local body = table.concat(parts2, "\n\n")
local tchunks = chunk.chunk_text_by_tokens(body, tok4, 30) -- ~120 chars/chunk ceiling
check("token chunk: long text splits into many", #tchunks > 1, "chunks=" .. #tchunks)
check("token chunk: lossless (concat == original)", table.concat(tchunks) == body)

-- Measure each chunk with the same counter that was injected, not with `#`.
local all_under = true
local over_detail = nil
for i, c in ipairs(tchunks) do
    if tok4(c) > 30 then
        all_under = false
        over_detail = over_detail
            or string.format("chunk %d counts %d tokens", i, tok4(c))
    end
end
check("token chunk: every chunk <= max_tokens by the counter", all_under, over_detail)
-- }}}

-- {{{ token chunking: no-separator blob hard-splits by tokens, losslessly
local blob = string.rep("z", 250) -- no separators of any kind
local bchunks = chunk.chunk_text_by_tokens(blob, count_chars, 60)
local blossless = (table.concat(bchunks) == blob)
local bunder = true
for _, c in ipairs(bchunks) do
    if #c > 60 then bunder = false end
end
-- 250 characters at 60 per chunk -> 5 pieces (4 * 60 + 10).
check("token chunk: no-separator blob hard-splits losslessly",
    blossless and bunder and #bchunks == 5, "chunks=" .. #bchunks)
-- }}}

-- {{{ combine: single vector returned as-is
-- One input vector, no weights, no strategy argument: the result is expected
-- to hold the same three components and nothing more.
local one = chunk.combine_chunk_vectors({ {1, 2, 3} })
check("combine single vector -> identity",
    #one == 3 and one[1] == 1 and one[2] == 2 and one[3] == 3)
-- }}}

-- {{{ combine: plain mean
-- Component-wise average of two 2-d vectors, with no weights supplied.
local mean = chunk.combine_chunk_vectors({ {0, 0}, {2, 4} }, nil, "mean")
check("mean of (0,0),(2,4) = (1,2)", mean[1] == 1 and mean[2] == 2)
-- }}}

-- {{{ combine: length-weighted mean favours the longer chunk
-- weights 1 and 3: (1*[0,0] + 3*[4,8]) / 4 = [3,6]
local lwm = chunk.combine_chunk_vectors({ {0, 0}, {4, 8} }, { 1, 3 }, "length_weighted_mean")
check("length-weighted mean tilts toward heavier chunk", lwm[1] == 3 and lwm[2] == 6)
-- }}}

-- {{{ combine: first_only drops the tail
-- The second vector is all zeros, so any averaging would pull the result away
-- from (9, 9); an exact (9, 9) shows only the first vector was used.
local first = chunk.combine_chunk_vectors({ {9, 9}, {0, 0} }, nil, "first_only")
check("first_only keeps only chunk 1", first[1] == 9 and first[2] == 9)
-- }}}

-- {{{ combine: dimension mismatch errors loudly (no silent garbage)
-- A 3-d and a 2-d vector cannot be averaged; the call is expected to raise.
-- pcall captures the raise as a `false` status.
local ok_err = pcall(function()
    chunk.combine_chunk_vectors({ {1, 2, 3}, {1, 2} }, nil, "mean")
end)
check("dimension mismatch raises an error", ok_err == false,
    "call returned without raising")
-- }}}

-- {{{ summary
-- Print the pass tally, then exit with status 1 if anything failed so a
-- calling build/deploy step can stop on it. With no failures the script simply
-- runs off the end.
print(string.format("\n%d/%d checks passed", tests - failures, tests))
if failures > 0 then
    os.exit(1)
end
-- }}}

-- vim: set foldmethod=marker: