From b34d7cdcdd79fcf6997ef6c04f93c43c4fb36d60 Mon Sep 17 00:00:00 2001
From: yggverse <yggverse@project>
Date: Sat, 10 Jan 2026 22:48:45 +0200
Subject: [PATCH] implement `allowed_tags` config option, format config, update
 documentation comments

---
 crates/crawler/config.toml   | 84 +++++++++++++++++++++++++++---------
 crates/crawler/src/config.rs | 13 +++---
 crates/crawler/src/main.rs   | 67 ++++++++++++++++------------
 3 files changed, 110 insertions(+), 54 deletions(-)
diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml
index ad50346..50f5a7e 100644
--- a/crates/crawler/config.toml
+++ b/crates/crawler/config.toml
@@ -4,29 +4,71 @@ update = 900
 # Database connection setup
 # * see crates/mysql/database
 [mysql]
-host = "localhost"
-port = 3306
-username = ""
-password = ""
-database = "rssto"
+
+    host = "localhost"
+    port = 3306
+    username = ""
+    password = ""
+    database = "rssto"
 
 # Content sources (unlimited)
 [[channel]]
-url = "https://"
-items_limit = 20
-persist_item_title = true
-persist_item_description = true
-# optional:
-# content_title_selector = "h1"
-# content_description_selector = "article"
-# persist_images_selector = "img"
+
+    # RSS feed source
+    url = "https://1"
+
+    # Limit latest channel items to crawl (unlimited by default)
+    items_limit = 20
+
+    # Save Channel item title in the database (currently not in use)
+    persist_item_title = true
+
+    # Save Channel item description in the database (currently not in use)
+    persist_item_description = true
+
+    # Allowed tags
+    # * empty to strip all tags (default)
+    allowed_tags = []
+
+    # Scrape title by CSS selector
+    # * None to use the Channel item title if it exists, or fail otherwise
+    # content_title_selector = "h1"
+
+    # Scrape description by CSS selector
+    # * None to use the Channel item description if it exists, or fail otherwise
+    # content_description_selector = "article"
+
+    # Preload content images locally if `Some`
+    # * currently stored in the database
+    # persist_images_selector = "img"
+
 
 [[channel]]
-url = "https://"
-items_limit = 20
-persist_item_title = true
-persist_item_description = true
-# optional:
-# content_title_selector = "h1"
-# content_description_selector = "article"
-# persist_images_selector = "img"
\ No newline at end of file
+
+    # RSS feed source
+    url = "https://2"
+
+    # Limit latest channel items to crawl (unlimited by default)
+    items_limit = 20
+
+    # Save Channel item title in the database (currently not in use)
+    persist_item_title = true
+
+    # Save Channel item description in the database (currently not in use)
+    persist_item_description = true
+
+    # Allowed tags
+    # * empty to strip all tags (default)
+    allowed_tags = []
+
+    # Scrape title by CSS selector
+    # * None to use the Channel item title if it exists, or fail otherwise
+    # content_title_selector = "h1"
+
+    # Scrape description by CSS selector
+    # * None to use the Channel item description if it exists, or fail otherwise
+    # content_description_selector = "article"
+
+    # Preload content images locally if `Some`
+    # * currently stored in the database
+    # persist_images_selector = "img"
diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs
index b4734cc..cf2881b 100644
--- a/crates/crawler/src/config.rs
+++ b/crates/crawler/src/config.rs
@@ -15,18 +15,21 @@ pub struct Mysql {
 pub struct Channel {
     /// RSS feed source
     pub url: Url,
-    /// Limit channel items (unlimited by default)
+    /// Limit latest channel items to crawl (unlimited by default)
     pub items_limit: Option<usize>,
-    /// Save item title
+    /// Save Channel item title in the database (currently not in use)
     pub persist_item_title: bool,
-    /// Save item description
+    /// Save Channel item description in the database (currently not in use)
     pub persist_item_description: bool,
     /// Scrape title by CSS selector
-    /// * None to ignore
+    /// * None to use the Channel item title if it exists, or fail otherwise
     pub content_title_selector: Option<Selector>,
     /// Scrape description by CSS selector
-    /// * None to ignore
+    /// * None to use the Channel item description if it exists, or fail otherwise
     pub content_description_selector: Option<Selector>,
+    /// Allowed tags
+    /// * empty to strip all tags (default)
+    pub allowed_tags: std::collections::HashSet<String>,
     /// Preload content images locally if `Some`
     /// * currently stored in the database
     pub persist_images_selector: Option<Selector>,
diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs
index 8d53155..110092b 100644
--- a/crates/crawler/src/main.rs
+++ b/crates/crawler/src/main.rs
@@ -60,11 +60,12 @@ fn main() -> Result<()> {
 }
 
 fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
-    use ammonia::clean;
+    use std::collections::HashSet;
 
-    fn strip_tags(html: &str) -> String {
+    /// Strips all tags from `html` except those in `allowed_tags`; strips every tag if `None`
+    fn strip_tags(html: &str, allowed_tags: Option<&HashSet<String>>) -> String {
         ammonia::Builder::new()
-            .tags(std::collections::HashSet::new())
+            .tags(allowed_tags.map_or(HashSet::new(), |a| a.iter().map(|t| t.as_str()).collect()))
             .clean(html)
             .to_string()
     }
@@ -113,12 +114,14 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
             guid,
             link,
             if channel_config.persist_item_title {
-                channel_item.title().map(strip_tags)
+                channel_item.title().map(|s| strip_tags(s, None))
             } else {
                 None
             },
             if channel_config.persist_item_description {
-                channel_item.description().map(clean)
+                channel_item
+                    .description()
+                    .map(|s| strip_tags(s, Some(&channel_config.allowed_tags)))
             } else {
                 None
             },
@@ -126,35 +129,43 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
         info!("Register new channel item #{channel_item_id} ({link})");
         // preload remote content..
         let html = scraper::Html::parse_document(&get(link)?.text()?);
-        let description = clean(&match channel_config.content_description_selector {
-            Some(ref selector) => match html.select(selector).next() {
-                Some(description) => description.inner_html(),
-                None => bail!("Could not scrape `description` selector from `{link}`"),
+        let description = strip_tags(
+            &match channel_config.content_description_selector {
+                Some(ref selector) => match html.select(selector).next() {
+                    Some(description) => description.inner_html(),
+                    None => bail!("Could not scrape `description` selector from `{link}`"),
+                },
+                None => match channel_item.description {
+                    Some(ref description) => description.clone(),
+                    None => {
+                        bail!("Could not assign `description` from channel item for `{link}`")
+                    }
+                },
             },
-            None => match channel_item.description {
-                Some(ref description) => description.clone(),
-                None => {
-                    bail!("Could not assign `description` from channel item for `{link}`")
-                }
-            },
-        });
+            Some(&channel_config.allowed_tags),
+        );
         let content_id = tx.insert_content(
             channel_item_id,
             None,
-            strip_tags(&match channel_config.content_title_selector {
-                Some(ref selector) => match html.select(selector).next() {
-                    Some(title) => title.inner_html(),
-                    None => bail!("Could not scrape `title` selector from `{link}`"),
+            strip_tags(
+                &match channel_config.content_title_selector {
+                    Some(ref selector) => match html.select(selector).next() {
+                        Some(title) => title.inner_html(),
+                        None => bail!("Could not scrape `title` selector from `{link}`"),
+                    },
+                    None => match channel_item.title {
+                        Some(ref title) => title.clone(),
+                        None => {
+                            bail!(
+                                "Could not assign `title` from channel item for content in `{link}`"
+                            )
+                        }
+                    },
                 },
-                None => match channel_item.title {
-                    Some(ref title) => title.clone(),
-                    None => {
-                        bail!("Could not assign `title` from channel item for content in `{link}`")
-                    }
-                },
-            })
+                None,
+            )
             .trim(),
-            clean(&description).trim(),
+            description.trim(),
         )?;
         info!("Add new content record #{content_id}");
         // persist images if enabled