From b34d7cdcdd79fcf6997ef6c04f93c43c4fb36d60 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 22:48:45 +0200 Subject: [PATCH] implement `allowed_tags` config option, format config, update documentation comments --- crates/crawler/config.toml | 84 +++++++++++++++++++++++++++--------- crates/crawler/src/config.rs | 13 +++--- crates/crawler/src/main.rs | 67 ++++++++++++++++------------ 3 files changed, 110 insertions(+), 54 deletions(-) diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml index ad50346..50f5a7e 100644 --- a/crates/crawler/config.toml +++ b/crates/crawler/config.toml @@ -4,29 +4,71 @@ update = 900 # Database connection setup # * see crates/mysql/database [mysql] -host = "localhost" -port = 3306 -username = "" -password = "" -database = "rssto" + + host = "localhost" + port = 3306 + username = "" + password = "" + database = "rssto" # Content sources (unlimited) [[channel]] -url = "https://" -items_limit = 20 -persist_item_title = true -persist_item_description = true -# optional: -# content_title_selector = "h1" -# content_description_selector = "article" -# persist_images_selector = "img" + + # RSS feed source + url = "https://1" + + # Limit latest channel items to crawl (unlimited by default) + items_limit = 20 + + # Save Channel item title in the database (currently not in use) + persist_item_title = true + + # Save Channel item description in the database (currently not in use) + persist_item_description = true + + # Allowed tags + # * empty to strip all tags (default) + allowed_tags = [] + + # Scrape title by CSS selector + # * None to use Channel item title if exists or fail to continue + # content_title_selector = "h1" + + # Scrape description by CSS selector + # * None to use Channel item description if exists or fail to continue + # content_description_selector = "article" + + # Preload content images locally if `Some` + # * currently stored in the database + # persist_images_selector = "img" + [[channel]] -url = 
"https://" -items_limit = 20 -persist_item_title = true -persist_item_description = true -# optional: -# content_title_selector = "h1" -# content_description_selector = "article" -# persist_images_selector = "img" \ No newline at end of file + + # RSS feed source + url = "https://2" + + # Limit latest channel items to crawl (unlimited by default) + items_limit = 20 + + # Save Channel item title in the database (currently not in use) + persist_item_title = true + + # Save Channel item description in the database (currently not in use) + persist_item_description = true + + # Allowed tags + # * empty to strip all tags (default) + allowed_tags = [] + + # Scrape title by CSS selector + # * None to use Channel item title if exists or fail to continue + # content_title_selector = "h1" + + # Scrape description by CSS selector + # * None to use Channel item description if exists or fail to continue + # content_description_selector = "article" + + # Preload content images locally if `Some` + # * currently stored in the database + # persist_images_selector = "img" diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index b4734cc..cf2881b 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -15,18 +15,21 @@ pub struct Mysql { pub struct Channel { /// RSS feed source pub url: Url, - /// Limit channel items (unlimited by default) + /// Limit latest channel items to crawl (unlimited by default) pub items_limit: Option, - /// Save item title + /// Save Channel item title in the database (currently not in use) pub persist_item_title: bool, - /// Save item description + /// Save Channel item description in the database (currently not in use) pub persist_item_description: bool, /// Scrape title by CSS selector - /// * None to ignore + /// * None to use Channel item title if exists or fail to continue pub content_title_selector: Option, /// Scrape description by CSS selector - /// * None to ignore + /// * None to use Channel item 
description if exists or fail to continue pub content_description_selector: Option, + /// Allowed tags + /// * empty to strip all tags (default) + pub allowed_tags: std::collections::HashSet<String>, /// Preload content images locally if `Some` /// * currently stored in the database pub persist_images_selector: Option, diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 8d53155..110092b 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -60,11 +60,12 @@ fn main() -> Result<()> { } fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> { - use ammonia::clean; + use std::collections::HashSet; - fn strip_tags(html: &str) -> String { + /// Removes all tags from `html` excluding `allowed_tags` or all if None + fn strip_tags(html: &str, allowed_tags: Option<&HashSet<String>>) -> String { ammonia::Builder::new() - .tags(std::collections::HashSet::new()) + .tags(allowed_tags.map_or(HashSet::new(), |a| a.iter().map(|t| t.as_str()).collect())) .clean(html) .to_string() } @@ -113,12 +114,14 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul guid, link, if channel_config.persist_item_title { - channel_item.title().map(strip_tags) + channel_item.title().map(|s| strip_tags(s, None)) } else { None }, if channel_config.persist_item_description { - channel_item.description().map(clean) + channel_item + .description() + .map(|s| strip_tags(s, Some(&channel_config.allowed_tags))) } else { None }, @@ -126,35 +129,43 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul info!("Register new channel item #{channel_item_id} ({link})"); // preload remote content.. 
let html = scraper::Html::parse_document(&get(link)?.text()?); - let description = clean(&match channel_config.content_description_selector { - Some(ref selector) => match html.select(selector).next() { - Some(description) => description.inner_html(), - None => bail!("Could not scrape `description` selector from `{link}`"), + let description = strip_tags( + &match channel_config.content_description_selector { + Some(ref selector) => match html.select(selector).next() { + Some(description) => description.inner_html(), + None => bail!("Could not scrape `description` selector from `{link}`"), + }, + None => match channel_item.description { + Some(ref description) => description.clone(), + None => { + bail!("Could not assign `description` from channel item for `{link}`") + } + }, }, - None => match channel_item.description { - Some(ref description) => description.clone(), - None => { - bail!("Could not assign `description` from channel item for `{link}`") - } - }, - }); + Some(&channel_config.allowed_tags), + ); let content_id = tx.insert_content( channel_item_id, None, - strip_tags(&match channel_config.content_title_selector { - Some(ref selector) => match html.select(selector).next() { - Some(title) => title.inner_html(), - None => bail!("Could not scrape `title` selector from `{link}`"), + strip_tags( + &match channel_config.content_title_selector { + Some(ref selector) => match html.select(selector).next() { + Some(title) => title.inner_html(), + None => bail!("Could not scrape `title` selector from `{link}`"), + }, + None => match channel_item.title { + Some(ref title) => title.clone(), + None => { + bail!( + "Could not assign `title` from channel item for content in `{link}`" + ) + } + }, }, - None => match channel_item.title { - Some(ref title) => title.clone(), - None => { - bail!("Could not assign `title` from channel item for content in `{link}`") - } - }, - }) + None, + ) .trim(), - clean(&description).trim(), + description.trim(), )?; info!("Add new 
content record #{content_id}"); // persist images if enabled