telegram: bridge photos via [image: <path>] markers

When an incoming update has a photo array, pick the largest size,
resolve the file_id via getFile, and download to
~/.consciousness/channels/telegram.logs/media/<file_id>.<ext>. The
message line surfaced to the channel is

    [image: /abs/path/to/file.jpg]
    <caption if any>

so a multimodal Read on the path works end-to-end. On download
failure we still surface the caption with an [image: download
failed: ...] marker so context isn't lost.

Other media types (voice/video/sticker/etc.) are skipped with a one-line
"skipping" log notice, which leaves an easy hook for extending support
later. The media/ dir was already being created at startup; this change
fills in the rest.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-05-01 17:58:43 -04:00
commit c303653dd0

View file

@ -181,6 +181,8 @@ struct TelegramMessage {
chat_id: i64, chat_id: i64,
sender: String, sender: String,
text: String, text: String,
/// Absolute path to a downloaded media file (photo, etc.), if any.
media_path: Option<String>,
} }
/// Fetch and parse pending updates from Telegram via long polling. /// Fetch and parse pending updates from Telegram via long polling.
@ -206,19 +208,107 @@ async fn get_updates(
let sender = msg["from"]["first_name"].as_str().unwrap_or("unknown").to_string(); let sender = msg["from"]["first_name"].as_str().unwrap_or("unknown").to_string();
let chat_id = msg["chat"]["id"].as_i64().unwrap_or(0); let chat_id = msg["chat"]["id"].as_i64().unwrap_or(0);
if let Some(text) = msg["text"].as_str() { // Photo: array of PhotoSize, largest is last. Download largest,
// surface message with [image: <path>] marker so the multimodal
// model can Read the image.
let (text, media_path) = if let Some(sizes) = msg["photo"].as_array() {
let caption = msg["caption"].as_str().unwrap_or("").to_string();
let largest = sizes.last();
let file_id = largest
.and_then(|s| s["file_id"].as_str())
.unwrap_or("");
if file_id.is_empty() {
error!("telegram photo: missing file_id in update {update_id}");
(caption, None)
} else {
match download_telegram_file(client, token, file_id).await {
Ok(path) => (caption, Some(path)),
Err(e) => {
error!("telegram photo download failed (file_id={file_id}): {e}");
// Surface what we have: caption plus a marker that
// a photo was sent but couldn't be fetched.
let marker = format!("[image: download failed: {e}]");
let combined = if caption.is_empty() {
marker
} else {
format!("{marker}\n{caption}")
};
(combined, None)
}
}
}
} else if let Some(text) = msg["text"].as_str() {
(text.to_string(), None)
} else {
// Other media types (voice, video, sticker, etc.) — skip for now,
// but log so we can extend later.
let kind = ["voice", "video", "sticker", "document", "audio", "animation"]
.iter()
.find(|k| !msg[**k].is_null())
.copied()
.unwrap_or("unknown");
info!("telegram: skipping non-text/photo message (kind={kind}, update_id={update_id})");
continue;
};
messages.push(TelegramMessage { messages.push(TelegramMessage {
update_id, update_id,
chat_id, chat_id,
sender, sender,
text: text.to_string(), text,
media_path,
}); });
} }
} }
}
Ok(messages) Ok(messages)
} }
/// Ask the Bot API's `getFile` method where the file behind `file_id` lives.
///
/// On success returns the `result.file_path` string from the JSON reply; the
/// caller appends it to the file-download endpoint to fetch the bytes.
///
/// # Errors
///
/// Fails when the HTTP request or body read fails, when the body is not valid
/// JSON, when the reply's `ok` field is not `true` (the error message carries
/// the reply's `description`, or `?` if absent), or when `result.file_path`
/// is missing.
async fn get_file_path(
    client: &HttpClient,
    token: &str,
    file_id: &str,
) -> Result<String, Box<dyn std::error::Error>> {
    // `file_id` is interpolated into the query string verbatim (no escaping).
    let endpoint = format!("https://api.telegram.org/bot{token}/getFile?file_id={file_id}");
    let http_reply = client.get(&endpoint).await?;
    let raw = http_reply.text().await?;

    let reply: serde_json::Value = match serde_json::from_str(&raw) {
        Ok(parsed) => parsed,
        Err(e) => return Err(format!("getFile JSON parse error: {e}").into()),
    };

    // A missing or non-boolean `ok` is treated the same as `ok: false`.
    if reply["ok"].as_bool() != Some(true) {
        let reason = reply["description"].as_str().unwrap_or("?");
        return Err(format!("getFile failed: {reason}").into());
    }

    match reply["result"]["file_path"].as_str() {
        Some(remote_path) => Ok(remote_path.to_owned()),
        None => Err("getFile: missing result.file_path".into()),
    }
}
/// Download a Telegram file by file_id into the channel media dir.
///
/// Resolves `file_id` to a server-side path via `get_file_path`, fetches the
/// bytes from the Bot API file endpoint, and writes them to
/// `<log_dir()>/media/<file_id>.<ext>`, creating the directory if needed.
/// `<ext>` is the extension of the server-side path, or `dat` when that path
/// has no (or an empty) extension.
///
/// Returns the local destination path on success, lossily converted to a
/// `String`. The path is absolute only if `log_dir()` is — presumably so;
/// confirm against `log_dir`.
///
/// # Errors
///
/// Fails if `file_id` contains a path separator, if `getFile` or the download
/// request fails, if the download responds with a non-success status, or if
/// the media directory or file cannot be written.
async fn download_telegram_file(
    client: &HttpClient,
    token: &str,
    file_id: &str,
) -> Result<String, Box<dyn std::error::Error>> {
    // `file_id` comes from the remote update and becomes part of a local
    // filename below; refuse separators so the write cannot land outside
    // the media directory.
    if file_id.contains(|c: char| matches!(c, '/' | '\\')) {
        return Err(format!("unsafe file_id for local filename: {file_id:?}").into());
    }

    let file_path = get_file_path(client, token, file_id).await?;
    let url = format!("https://api.telegram.org/file/bot{}/{}", token, file_path);
    let response = client.get(&url).await?;
    let status = response.status();
    if !status.is_success() {
        return Err(format!("file download failed: {status}").into());
    }
    let bytes = response.bytes().await?;

    // Take the extension from the final path component only. A server path
    // without a dot (e.g. "file_0") or with a trailing dot falls back to
    // "dat" instead of reusing the whole name or producing "<file_id>.".
    let ext = std::path::Path::new(&file_path)
        .extension()
        .and_then(|e| e.to_str())
        .filter(|e| !e.is_empty())
        .unwrap_or("dat");

    let media_dir = log_dir().join("media");
    // NOTE(review): these std::fs calls block the calling thread inside an
    // async fn; consider an async write if the executor is shared.
    std::fs::create_dir_all(&media_dir)?;
    let dest = media_dir.join(format!("{file_id}.{ext}"));
    std::fs::write(&dest, &bytes)?;
    Ok(dest.to_string_lossy().to_string())
}
/// Send a text message to a Telegram chat. /// Send a text message to a Telegram chat.
async fn send_message( async fn send_message(
client: &HttpClient, client: &HttpClient,
@ -369,11 +459,19 @@ async fn poll_once(
let sender_lower = msg.sender.to_lowercase(); let sender_lower = msg.sender.to_lowercase();
let channel = format!("telegram.{}", sender_lower); let channel = format!("telegram.{}", sender_lower);
channel_log::append_disk_log(&log_dir(), &sender_lower, &msg.sender, &msg.text); // If the message has media, prepend an [image: <abs_path>] marker
// so the multimodal model can Read the file directly.
let body = match &msg.media_path {
Some(path) if msg.text.is_empty() => format!("[image: {path}]"),
Some(path) => format!("[image: {path}]\n{}", msg.text),
None => msg.text.clone(),
};
channel_log::append_disk_log(&log_dir(), &sender_lower, &msg.sender, &body);
let mut s = state.borrow_mut(); let mut s = state.borrow_mut();
s.config.chat_ids.insert(sender_lower, msg.chat_id); s.config.chat_ids.insert(sender_lower, msg.chat_id);
let line = format!("[{}] {}", msg.sender, msg.text); let line = format!("[{}] {}", msg.sender, body);
s.push_message(line, 2, &channel); s.push_message(line, 2, &channel);
} }