177 lines
		
	
	
	
		
			7 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
			
		
		
	
	
			177 lines
		
	
	
	
		
			7 KiB
		
	
	
	
		
			Rust
		
	
	
	
	
	
//! # The Embed Module
 | 
						|
//!
 | 
						|
//! This module controls the embed functionality of frogbot.
 | 
						|
 | 
						|
use lazy_static::lazy_static;
 | 
						|
use log::warn;
 | 
						|
use matrix_sdk::{
 | 
						|
    room::Room,
 | 
						|
    ruma::events::room::message::{
 | 
						|
        MessageType, OriginalSyncRoomMessageEvent, Relation, RoomMessageEventContent,
 | 
						|
    },
 | 
						|
    Client,
 | 
						|
};
 | 
						|
use regex::Regex;
 | 
						|
use scraper::{Html, Selector};
 | 
						|
 | 
						|
/// Represents an Embed in the chat
///
/// Both fields default to the empty string via [`Default`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Embed {
    /// The title of the embed
    pub title: String,
    /// The description
    pub description: String,
}

impl Embed {
    /// Creates a new [`Embed`] from an already-extracted title and description.
    pub fn new(title: String, description: String) -> Self {
        Self { title, description }
    }
}
 | 
						|
 | 
						|
/// Scrapes the HTML of a webpage and generates an [`Embed`] with the scraped information.
 | 
						|
pub fn parse_metadata(page: &str) -> Option<Embed> {
 | 
						|
    let doc_body = Html::parse_document(page);
 | 
						|
 | 
						|
    // Selectors used to get metadata are defined here
 | 
						|
    let title_selector = Selector::parse("title").unwrap();
 | 
						|
    let description_selector = Selector::parse("meta[name=\"description\"]").unwrap();
 | 
						|
 | 
						|
    // Grab the actual data
 | 
						|
    let title = doc_body.select(&title_selector).next();
 | 
						|
    let desc = doc_body.select(&description_selector).next();
 | 
						|
    // Clean up meta info and store it as a string
 | 
						|
    let mut meta_title = String::default();
 | 
						|
    let mut meta_description = String::default();
 | 
						|
 | 
						|
    if let (None, None) = (title, desc) {
 | 
						|
        warn!("Couldn't parse any metadata for URL");
 | 
						|
        return None;
 | 
						|
    }
 | 
						|
 | 
						|
    if let Some(title) = title {
 | 
						|
        meta_title = title.text().collect();
 | 
						|
    } else {
 | 
						|
        warn!("Failed to parse title HTML");
 | 
						|
    }
 | 
						|
 | 
						|
    if let Some(desc) = desc {
 | 
						|
        meta_description = desc.value().attr("content").unwrap().to_string();
 | 
						|
    } else {
 | 
						|
        warn!("Failed to parse description HTML");
 | 
						|
    }
 | 
						|
 | 
						|
    Some(Embed::new(meta_title, meta_description))
 | 
						|
}
 | 
						|
 | 
						|
/// Check if the message has any urls in it and get them if it does
 | 
						|
fn get_urls_from_message(message: &str) -> Vec<&str> {
 | 
						|
    // Using lazy static magic here, so this means the regex is compiled exactly once
 | 
						|
    // After initial compile it gets reused instead of recompiling on every message event
 | 
						|
    lazy_static! {
 | 
						|
        // shamelessly stolen and modified from some garbage blog online
 | 
						|
        // I have no fucking idea how this works - https://urlregex.com/
 | 
						|
        static ref RE: Regex = Regex::new(r"(?:(?:https?)://)(?:\S+(?::\S*)?@|\d{1,3}(?:\.\d{1,3}){3}|(?:(?:[a-z\d\x{00a1}-\x{ffff}]+-?)*[a-z\d\x{00a1}-\x{ffff}]+)(?:\.(?:[a-z\d\x{00a1}-\x{ffff}]+-?)*[a-z\d\x{00a1}-\x{ffff}]+)*(?:\.[a-z\x{00a1}-\x{ffff}]{2,6}))(?::\d+)?(?:[^\s]*)?").unwrap();
 | 
						|
    }
 | 
						|
 | 
						|
    // This will hold all the urls in the message if any are found
 | 
						|
    let mut urls: Vec<&str> = vec![];
 | 
						|
 | 
						|
    if RE.is_match(message) {
 | 
						|
        // If we find any urls, push them into the urls vec
 | 
						|
        for regex_match in RE.find_iter(message) {
 | 
						|
            // If the url points to localhost, we don't want to embed it, so we ignore it
 | 
						|
            if regex_match.as_str().to_lowercase().contains("localhost")
 | 
						|
                || regex_match.as_str().to_lowercase().contains("127.0.0.1")
 | 
						|
            {
 | 
						|
                warn!("This is probably a malicious URL, ignoring!");
 | 
						|
            } else {
 | 
						|
                warn!("Found {}", ®ex_match.as_str());
 | 
						|
                urls.push(regex_match.as_str());
 | 
						|
            }
 | 
						|
        }
 | 
						|
    } else {
 | 
						|
        // If we don't find any urls, do nothing
 | 
						|
    };
 | 
						|
    urls
 | 
						|
}
 | 
						|
 | 
						|
/// Checks messages for valid links and generates embeds if found
 | 
						|
pub async fn embed_handler(event: OriginalSyncRoomMessageEvent, room: Room, client: Client) {
 | 
						|
    if let Room::Joined(room) = room {
 | 
						|
        let full_reply_event = event.clone().into_full_event(room.room_id().to_owned());
 | 
						|
 | 
						|
        // If the sender ID matches our client, ignore the message
 | 
						|
        // We don't want to reply to ourselves
 | 
						|
        let client_user_id = client.user_id().unwrap();
 | 
						|
        if event.sender == client_user_id {
 | 
						|
            return;
 | 
						|
        }
 | 
						|
 | 
						|
        // Do not make an embed if someone replies to a URL
 | 
						|
        // Unfortunately, this makes it so that if your reply has a URL, it will not embed.
 | 
						|
        // TODO: Fix this by scanning replies and only generating embeds for new URLs in future.
 | 
						|
        if let Some(Relation::Reply { in_reply_to: _ }) = &event.content.relates_to {
 | 
						|
            warn!("Ignoring message, it's a reply to someone else!");
 | 
						|
            return;
 | 
						|
        }
 | 
						|
 | 
						|
        // Ignore anything that isn't text
 | 
						|
        let MessageType::Text(text_content) = event.content.msgtype else {
 | 
						|
            warn!("Ignoring message, content is not plaintext!");
 | 
						|
            return;
 | 
						|
        };
 | 
						|
 | 
						|
        let urls = get_urls_from_message(&text_content.body);
 | 
						|
 | 
						|
        let reqwest_client = reqwest::Client::builder().user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36").build().unwrap();
 | 
						|
 | 
						|
        for url in urls {
 | 
						|
            if let Ok(req) = reqwest_client.get(url).send().await {
 | 
						|
                if let Ok(res) = req.text().await {
 | 
						|
                    // beware, dirty HTML parsing code
 | 
						|
                    let metadata = parse_metadata(&res);
 | 
						|
 | 
						|
                    // Build and send our message reply
 | 
						|
                    if metadata.is_some() {
 | 
						|
                        let embed = metadata.unwrap();
 | 
						|
                        let bot_reply = RoomMessageEventContent::text_html(
 | 
						|
                            &embed.title,
 | 
						|
                            format!(
 | 
						|
                                "<blockquote>
 | 
						|
                                <h4>{}</h4>
 | 
						|
                                <p>{}</p>
 | 
						|
                                </blockquote>",
 | 
						|
                                &embed.title, &embed.description
 | 
						|
                            ),
 | 
						|
                        )
 | 
						|
                        .make_reply_to(&full_reply_event);
 | 
						|
 | 
						|
                        // Finally send the reply to the room
 | 
						|
                        warn!("Sending embed for URL: '{}'", &url);
 | 
						|
                        if room.send(bot_reply, None).await.is_err() {
 | 
						|
                            warn!("Failed to send embed for URL: '{}'", &url);
 | 
						|
                        }
 | 
						|
                    // If we didn't get any metadata send a generic "No metadata" response
 | 
						|
                    } else {
 | 
						|
                        let bot_reply = RoomMessageEventContent::text_html(
 | 
						|
                            "Couldn't parse metadata for URL",
 | 
						|
                            "<blockquote><h5>Couldn't parse metadata for URL</h5></blockquote>",
 | 
						|
                        )
 | 
						|
                        .make_reply_to(&full_reply_event);
 | 
						|
                        // Send the reply to the room
 | 
						|
                        warn!("Sending 'No metadata' embed for URL: '{}'", &url);
 | 
						|
                        if room.send(bot_reply, None).await.is_err() {
 | 
						|
                            warn!("Failed to send embed for URL: '{}'", &url);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                } else {
 | 
						|
                    warn!("Failed to parse HTML for URL: '{}'", &url);
 | 
						|
                }
 | 
						|
            } else {
 | 
						|
                warn!("Failed to fetch metadata for '{}'", &url);
 | 
						|
            }
 | 
						|
        }
 | 
						|
    };
 | 
						|
}
 |