68 lines
2.2 KiB
Bash
68 lines
2.2 KiB
Bash
|
|
#!/bin/bash
# refine-source.sh — find the exact conversation region a journal entry refers to
#
# Usage: refine-source.sh JSONL_PATH GREP_LINE "journal entry text"
#
#   JSONL_PATH  conversation log, one JSON record per line
#   GREP_LINE   1-based line number of the rough grep hit; 0, empty, or a
#               number past the end of the file means "no hit", in which case
#               the tail of the file is used instead
#   TEXT        the journal entry text (reserved for the agent step, see below)
#
# Takes the rough grep hit and renders the surrounding window of the log
# (WINDOW/2 lines before the hit through WINDOW/2 lines after it, clamped to
# the file) into a readable transcript, meant to be fed to an agent that
# identifies the exact start/end of the relevant exchange.
#
# Outputs: START_LINE:END_LINE on stdout, or "0:0" when the window contains no
# user/assistant text. Exits 2 on bad usage and 1 on any other failure.

set -euo pipefail

# Lines of context around the hit. The emitted range is inclusive on both
# ends, so it spans up to WINDOW + 1 lines.
readonly WINDOW=2000

# Python program that turns raw JSONL (stdin) into "L<n> [role]: text" lines.
# It is kept in a quoted here-doc so the shell never expands anything inside
# it; the first line number is passed as argv[1] instead of being spliced into
# the source. `read -d ''` returns non-zero at end of input, hence `|| true`.
IFS= read -r -d '' PARSER <<'PY' || true
import json
import sys

MAX_CHARS = 200  # each text fragment is truncated to this many characters


def fragments(content):
    """Yield the plain-text pieces of a message's content field."""
    if isinstance(content, str):
        yield content
    elif isinstance(content, list):
        for part in content:
            if isinstance(part, str):
                yield part
            elif isinstance(part, dict) and part.get("type") == "text":
                text = part.get("text")
                if isinstance(text, str):
                    yield text


def main():
    first = int(sys.argv[1])
    # Read bytes so that an undecodable line is skipped like any other bad
    # record instead of aborting the whole stream.
    for lineno, raw in enumerate(sys.stdin.buffer, start=first):
        try:
            record = json.loads(raw)
        except ValueError:  # JSONDecodeError and UnicodeDecodeError
            continue
        if not isinstance(record, dict):
            continue
        role = record.get("type")
        if role not in ("assistant", "user"):
            continue
        message = record.get("message")
        content = message.get("content") if isinstance(message, dict) else None
        parts = [piece[:MAX_CHARS] for piece in fragments(content)]
        if role == "assistant":
            # One output line per assistant record: fragments joined by spaces.
            parts = [" ".join(parts)]
        for part in parts:
            if part.strip():
                print(f"L{lineno} [{role}]: {part}")


main()
PY
readonly PARSER

# Print an error message to stderr and exit 1.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print the usage line to stderr and exit 2.
usage() {
  printf 'Usage: %s JSONL_PATH GREP_LINE "journal entry text"\n' "${0##*/}" >&2
  exit 2
}

# Render lines $2..$3 (inclusive) of file $1 as a readable transcript on
# stdout. sed quits at the end of the range so the rest of the file is never
# read.
render_chunk() {
  local jsonl=$1 start=$2 end=$3
  sed -n -e "${start},${end}p" -e "${end}q" < "$jsonl" \
    | python3 -c "$PARSER" "$start"
}

main() {
  (( $# >= 3 )) || usage

  local jsonl=$1
  local grep_line=${2:-0}
  # shellcheck disable=SC2034  # consumed by the upcoming agent step
  local text=$3

  [[ -f "$jsonl" && -r "$jsonl" ]] || die "not a readable file: $jsonl"
  [[ "$grep_line" =~ ^[0-9]+$ ]] \
    || die "GREP_LINE must be a non-negative integer, got '$grep_line'"
  grep_line=$(( 10#$grep_line ))   # force base 10 so "08" is not bad octal

  # awk rather than `wc -l`: its output is never space-padded (BSD wc pads,
  # which would leak into the START:END output) and it counts a final line
  # that has no trailing newline.
  local total
  total=$(LC_ALL=C awk 'END { print NR }' < "$jsonl") \
    || die "cannot count lines in $jsonl"

  if (( total == 0 )); then
    printf '0:0\n'
    return 0
  fi

  # Window around the grep hit, or the tail of the file if there is no hit.
  local start end
  if (( grep_line == 0 || grep_line > total )); then
    start=$(( total > WINDOW ? total - WINDOW : 1 ))
  else
    start=$(( grep_line > WINDOW / 2 ? grep_line - WINDOW / 2 : 1 ))
  fi
  end=$(( start + WINDOW ))
  (( end <= total )) || end=$total

  # Extract the conversation chunk, parsed to a readable format. stderr is
  # left attached so a real failure (e.g. python3 missing) is visible.
  local chunk
  chunk=$(render_chunk "$jsonl" "$start" "$end") \
    || die "failed to render lines ${start}-${end} of $jsonl"

  if [[ -z "$chunk" ]]; then
    printf '0:0\n'
    return 0
  fi

  # Ask Sonnet to find the exact region
  # For now, output the chunk range — agent integration comes next
  printf '%s:%s\n' "$start" "$end"
}

main "$@"
|