scripts/update
#!/bin/bash
# Content extraction script with ZIP archive support
# Automatically detects ZIP archives and extracts JSON data for poem processing pipeline
# Implements freshness checking to skip extraction when outputs are up to date
set -euo pipefail
# {{{ setup_dir_path
# Print the project directory to operate on.
# Arguments: $1 - (optional) project directory; may be empty or omitted
# Outputs:   $1 when it is non-empty, otherwise the built-in default path
setup_dir_path() {
  # ${1:-...} covers both "empty" and "not passed at all", so the function
  # stays safe under `set -u` even when called without arguments.
  # printf instead of echo so a value such as "-n" is printed literally.
  printf '%s\n' "${1:-/mnt/mtwo/programming/ai-stuff/neocities-modernization}"
}
# }}}
# {{{ is_extraction_fresh
# Check if extracted poems.json files are newer than source ZIP archives
# Returns 0 (true) if fresh, 1 (false) if extraction needed
# Decide whether the generated poems.json files are up to date.
# Arguments: $1 - project directory (archives are searched under $1/input)
# Returns:   0 when nothing needs extracting: either no *.zip / *.car file
#            exists under input/, or every expected poems.json exists and
#            the oldest of them is strictly newer than the newest archive.
#            1 when an output is missing or not newer than an archive.
is_extraction_fresh() {
  local dir="$1"

  # Output files to check
  local -a outputs=(
    "${dir}/input/fediverse/files/poems.json"
    "${dir}/input/messages/files/poems.json"
    "${dir}/input/notes/files/poems.json"
    "${dir}/input/bluesky/files/poems.json"
  )

  # Find all ZIP archives and CAR files in the input directory. The patterns
  # need the leading '*': a bare ".zip" only matches a file literally named
  # ".zip", which made this check report "fresh" for every real archive.
  local -a archives=()
  local archive
  while IFS= read -r -d '' archive; do
    archives+=("$archive")
  done < <(find "${dir}/input" \( -name "*.zip" -o -name "*.car" \) -type f -print0 2>/dev/null)

  # If no archives found, nothing to extract
  if [ "${#archives[@]}" -eq 0 ]; then
    return 0
  fi

  # Every output must exist, otherwise extraction is needed
  local output
  for output in "${outputs[@]}"; do
    [ -f "$output" ] || return 1
  done

  # Oldest output modification time (a stat failure counts as epoch 0,
  # i.e. as old as possible)
  local mtime
  local oldest_output_mtime=999999999999
  for output in "${outputs[@]}"; do
    mtime=$(stat -c %Y "$output" 2>/dev/null || echo "0")
    if [ "$mtime" -lt "$oldest_output_mtime" ]; then
      oldest_output_mtime=$mtime
    fi
  done

  # Newest archive modification time
  local newest_archive_mtime=0
  for archive in "${archives[@]}"; do
    mtime=$(stat -c %Y "$archive" 2>/dev/null || echo "0")
    if [ "$mtime" -gt "$newest_archive_mtime" ]; then
      newest_archive_mtime=$mtime
    fi
  done

  # Fresh only if the oldest output is strictly newer than the newest archive
  [ "$oldest_output_mtime" -gt "$newest_archive_mtime" ]
}
# }}}
# Parse command line arguments for project directory and options
# Option defaults; overwritten by the loop below.
SCRIPT_DIR=""
FORCE_EXTRACT=false
INCLUDE_BOOSTS=false

# Consume the positional parameters one at a time. Any word starting with
# '-' that is not a known flag is an error; any other word is taken as the
# project directory (the last such word wins).
until [[ $# -eq 0 ]]; do
  case "$1" in
    --force | -f)
      FORCE_EXTRACT=true
      ;;
    --include-boosts)
      INCLUDE_BOOSTS=true
      ;;
    -*)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
    *)
      SCRIPT_DIR="$1"
      ;;
  esac
  shift
done
# Set up project directory: the positional argument when one was given,
# otherwise the default printed by setup_dir_path.
DIR=$(setup_dir_path "$SCRIPT_DIR")

# Check if extraction is needed (freshness check). Without --force, exit
# successfully as soon as is_extraction_fresh reports nothing to do.
if [ "$FORCE_EXTRACT" = false ] && is_extraction_fresh "$DIR"; then
  # NOTE(review): the leading glyph arrived mis-decoded (a stray character
  # followed by a line break inside the literal); restored as a check mark.
  # Confirm this is the intended emoji.
  echo "✅ Extraction data is up to date, skipping ZIP extraction"
  echo " (use --force to extract anyway)"
  exit 0
fi
echo "๐ Starting content extraction with ZIP archive support..."

# NOTE: External file syncing happens in Stage 1 (update-words), not here.
# By this point, all external files (ZIPs, notes, images) should already be
# in input/.

# Guarantee the tmpfs-backed tmp/ symlink exists before any working directory
# is created underneath it. Issue 8-059 unified the previous split between
# temp/ and tmp/ into a single tmpfs path.
"${DIR}/scripts/ensure-tmp-symlink" "${DIR}"

# Create temporary extraction directory
TEMP_EXTRACT_DIR="${DIR}/tmp/extract-$(date +%s)"
mkdir -p "${TEMP_EXTRACT_DIR}"

# Remove the working directory on every exit path. Without this, an abort
# triggered by `set -e` anywhere after this point would leave the directory
# behind. The error branches below therefore only need to `exit 1`.
trap 'rm -rf -- "${TEMP_EXTRACT_DIR}"' EXIT

# Extract ZIP archives to temporary locations
echo "๐ฆ Extracting ZIP archives..."
if ! lua "${DIR}/scripts/zip-extractor.lua" "${DIR}" "${TEMP_EXTRACT_DIR}"; then
  echo "Error: ZIP extraction failed" >&2
  exit 1
fi

# Check extraction summary: its absence from the working directory is
# treated as a failed extraction.
EXTRACTION_SUMMARY="${TEMP_EXTRACT_DIR}/extraction-summary.json"
if [ ! -f "${EXTRACTION_SUMMARY}" ]; then
  echo "Error: Extraction summary not found" >&2
  exit 1
fi
# Extract content from temporary ZIP extractions
echo "๐ฑ Processing extracted archives..."

# Check for fediverse data: only attempted when the extracted archive
# contains an outbox.json.
if [ -f "${TEMP_EXTRACT_DIR}/fediverse/extract/outbox.json" ]; then
  echo "๐ฑ Extracting fediverse content..."
  # Issue 8-011: Pass boost inclusion flag if enabled
  BOOST_FLAG=""
  # Compare the flag as a string rather than executing its value as a command.
  if [ "$INCLUDE_BOOSTS" = true ]; then
    BOOST_FLAG="--include-boosts"
    echo " ๐ค Including fediverse boosts in extraction"
  fi
  # ${BOOST_FLAG:+...} expands to nothing when the flag is empty and to
  # exactly one quoted word otherwise, so no unquoted word splitting is needed.
  # A failure here is reported but does not stop the remaining extractions.
  lua "${DIR}/scripts/extract-fediverse.lua" "${DIR}" "${TEMP_EXTRACT_DIR}/fediverse" \
    ${BOOST_FLAG:+"${BOOST_FLAG}"} || {
    echo "Warning: Fediverse extraction failed" >&2
  }
else
  echo "โน๏ธ No fediverse data found in archives"
fi
# Check for messages data: only attempted when the extracted archive
# contains an export.json. A failure is reported as a warning and does not
# stop the remaining extractions.
if [ -f "${TEMP_EXTRACT_DIR}/messages/extract/export.json" ]; then
  echo "๐ฌ Extracting messages content..."
  lua "${DIR}/scripts/extract-messages.lua" "${DIR}" "${TEMP_EXTRACT_DIR}/messages" || {
    echo "Warning: Messages extraction failed" >&2
  }
else
  echo "โน๏ธ No messages data found in archives"
fi
# Check for notes data: the extracted notes directory must exist and hold at
# least one regular file (searched recursively). A failure is reported as a
# warning and does not stop the remaining extractions.
if [ -d "${TEMP_EXTRACT_DIR}/notes/extract" ] \
  && [ -n "$(find "${TEMP_EXTRACT_DIR}/notes/extract" -type f | head -1)" ]; then
  echo "๐ Extracting notes content..."
  lua "${DIR}/scripts/extract-notes.lua" "${DIR}" "${TEMP_EXTRACT_DIR}/notes" || {
    echo "Warning: Notes extraction failed" >&2
  }
else
  echo "โน๏ธ No notes data found in archives"
fi

# Also process notes from main directory if not from ZIP. This runs whenever
# input/notes contains at least one regular file whose name does not start
# with a dot (searched recursively), regardless of the branch taken above.
if [ -d "${DIR}/input/notes" ] \
  && [ -n "$(find "${DIR}/input/notes" -type f -not -name '.*' | head -1)" ]; then
  echo "๐ Processing notes from main directory..."
  bash "${DIR}/scripts/run-notes" "${DIR}" || {
    echo "Warning: Main notes processing failed" >&2
  }
fi
# Check for Bluesky CAR files (find newest by mtime, not specific filenames).
# Any *.car file under input/bluesky qualifies (e.g. repo-*.car).
#
# `|| true` keeps the lookup best-effort. Under `set -e -o pipefail` a
# failing stage (for example find on a missing input/bluesky directory)
# would otherwise abort the whole script instead of reaching the
# "no CAR files" branch below. The assignment still receives whatever the
# pipeline printed.
CAR_FILE=$(find "${DIR}/input/bluesky" -name "*.car" -type f -printf '%T@ %p\n' 2>/dev/null \
  | sort -rn | head -1 | cut -d' ' -f2-) || true

if [ -n "$CAR_FILE" ]; then
  echo "๐ฆ Extracting Bluesky content from $(basename "$CAR_FILE")..."
  # A failure is reported as a warning and does not stop the script.
  "${DIR}/scripts/extract-bluesky-data" "$CAR_FILE" "${DIR}/input/bluesky/files/poems.json" || {
    echo "Warning: Bluesky extraction failed" >&2
  }
else
  echo "โน๏ธ No Bluesky CAR files found in input/bluesky/"
fi
# Preserve media attachments before cleanup: copy them out of the temporary
# fediverse extraction into input/media_attachments.
echo "๐ผ๏ธ Preserving media attachments..."
MEDIA_DEST="${DIR}/input/media_attachments"
if [ -d "${TEMP_EXTRACT_DIR}/fediverse/extract/media_attachments" ]; then
  mkdir -p "${MEDIA_DEST}"
  # -n: never overwrite a file that already exists in the destination.
  # Copy errors (including an empty source directory, where the glob matches
  # nothing) are deliberately ignored: this step is best-effort.
  cp -rn "${TEMP_EXTRACT_DIR}/fediverse/extract/media_attachments/"* "${MEDIA_DEST}/" 2>/dev/null || true
  # Count every image now present in the destination, not only the files
  # copied by this run. Each pattern needs its leading '*'; a bare ".png"
  # would only match a file literally named ".png".
  MEDIA_COUNT=$(find "${MEDIA_DEST}" -type f \
    \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.gif" -o -name "*.webp" \) \
    2>/dev/null | wc -l)
  # Quote ${DIR} inside the prefix-strip so it is matched literally rather
  # than as a glob pattern.
  echo " ๐ท Preserved ${MEDIA_COUNT} media files to ./${MEDIA_DEST#"${DIR}"/}"
fi
# Cleanup temporary files
echo "๐งน Cleaning up temporary extraction files..."
# ${VAR:?} refuses to run rm -rf with an empty or unset path.
rm -rf -- "${TEMP_EXTRACT_DIR:?}"
# NOTE(review): the leading glyph arrived mis-decoded (a stray character
# followed by a line break inside the literal); restored as a check mark.
# Confirm this is the intended emoji.
echo "✅ Content extraction complete"
echo " ๐ JSON files generated from ZIP archives for HTML pipeline"