From ec0cca64f3b02b8606d17ddc72840af4e47b7e48 Mon Sep 17 00:00:00 2001
From: yggverse <yggverse@project>
Date: Sat, 10 Jan 2026 14:38:01 +0200
Subject: [PATCH] implement `persist_images_selector`, minimize codebase by
 using `bail`, change image table structure to use sha256 hash as the unique
 image identity

---
 crates/crawler/Cargo.toml       |   1 +
 crates/crawler/config.toml      |   4 +-
 crates/crawler/src/config.rs    |   3 +
 crates/crawler/src/main.rs      | 141 +++++++++++++++-----------------
 crates/mysql/database/0.1.0.sql |   8 +-
 crates/mysql/src/table.rs       |   8 +-
 crates/mysql/src/transaction.rs |  22 +++--
 7 files changed, 97 insertions(+), 90 deletions(-)
diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml
index d531744..6e55b06 100644
--- a/crates/crawler/Cargo.toml
+++ b/crates/crawler/Cargo.toml
@@ -19,6 +19,7 @@ reqwest = { version = "0.13.1", features = ["blocking"] }
 rss = "2.0.12"
 scraper = { version = "0.25.0", features = ["serde"] }
 serde = { version = "1.0.228", features = ["derive"] }
+sha2 = "0.10.9"
 toml = "0.9.10"
 tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
 url = { version = "2.5.8", features = ["serde"] }
\ No newline at end of file
diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml
index bde12ba..3232c16 100644
--- a/crates/crawler/config.toml
+++ b/crates/crawler/config.toml
@@ -19,6 +19,7 @@ persist_item_description = true
 # optional:
 # content_title_selector = "h1"
 # content_description_selector = "article"
+# persist_images_selector = "img"
 
 [[channel]]
 url = "https://"
@@ -27,4 +28,5 @@ persist_item_title = true
 persist_item_description = true
 # optional:
 # content_title_selector = "h1"
-# content_description_selector = "article"
\ No newline at end of file
+# content_description_selector = "article"
+# persist_images_selector = "img"
\ No newline at end of file
diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs
index dc325b5..701c6e4 100644
--- a/crates/crawler/src/config.rs
+++ b/crates/crawler/src/config.rs
@@ -27,6 +27,9 @@ pub struct Channel {
     /// Scrape description by CSS selector
     /// * None to ignore
     pub content_description_selector: Option<Selector>,
+    /// Preload content images locally if `Some`
+    /// * currently stored in the database
+    pub persist_images_selector: Option<Selector>,
 }
 
 #[derive(Debug, Deserialize)]
diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs
index 0d0867a..0767499 100644
--- a/crates/crawler/src/main.rs
+++ b/crates/crawler/src/main.rs
@@ -1,9 +1,10 @@
 mod argument;
 mod config;
 
-use anyhow::Result;
+use anyhow::{Result, bail};
 use log::{debug, info, warn};
 use reqwest::blocking::get;
+use url::Url;
 
 fn main() -> Result<()> {
     use chrono::Local;
@@ -59,64 +60,40 @@ fn main() -> Result<()> {
 }
 
 fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
-    use rss::Channel;
-    use scraper::Selector;
-
-    /// local helper
-    fn scrape(url: &str, selector: &Selector) -> Result<Option<String>> {
-        let document = scraper::Html::parse_document(&get(url)?.text()?);
-        Ok(if let Some(first) = document.select(selector).next() {
-            Some(first.inner_html())
-        } else {
-            warn!("Could not scrape requested inner");
-            None
-        })
-    }
-
     let channel_url = channel_config.url.to_string(); // allocate once
 
-    let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
-        Ok(response) => response.into_items(),
-        Err(e) => {
-            warn!("Could not parse response from `{channel_url}`: `{e}`");
-            return Ok(());
-        }
-    };
+    let channel_items =
+        match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
+            Ok(response) => response.into_items(),
+            Err(e) => bail!("Could not parse response: `{e}`"),
+        };
 
     let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
 
     let channel_id = match tx.channel_id_by_url(&channel_url)? {
         Some(channel_id) => channel_id,
-        None => tx.insert_channel(&channel_url)?,
+        None => {
+            let channel_id = tx.insert_channel(&channel_url)?;
+            info!("Register new channel #{channel_id} ({channel_url})");
+            channel_id
+        }
     };
 
     for channel_item in channel_items.iter().take(channel_items_limit) {
         let guid = match channel_item.guid {
             Some(ref guid) => guid.value.as_ref(),
-            None => {
-                warn!("Undefined `guid` field in `{channel_url}`");
-                continue;
-            }
+            None => bail!("Undefined `guid` field"),
         };
-        let link = match channel_item.link {
-            Some(ref link) => link,
-            None => {
-                warn!("Undefined `link` field in `{channel_url}`");
-                continue;
-            }
+        let (link, base) = match channel_item.link {
+            Some(ref link) => (link, Url::parse(link)?),
+            None => bail!("Undefined `link` field"),
         };
         let pub_date = match channel_item.pub_date {
             Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
                 Ok(t) => t.timestamp(),
-                Err(e) => {
-                    warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`");
-                    continue;
-                }
+                Err(e) => bail!("Invalid `pub_date` field: `{e}`"),
             },
-            None => {
-                warn!("Undefined `pub_date` field in `{channel_url}`");
-                continue;
-            }
+            None => bail!("Undefined `pub_date`"),
         };
         if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 {
             continue; // skip next steps as processed
@@ -137,57 +114,67 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
                 None
             },
         )?;
+        info!("Register new channel item #{channel_item_id} ({link})");
         // preload remote content..
+        let html = scraper::Html::parse_document(&get(link)?.text()?);
         let title = match channel_config.content_title_selector {
-            Some(ref selector) => match scrape(link, selector) {
-                Ok(value) => match value {
-                    Some(title) => title,
-                    None => {
-                        warn!("Could not scrape `title` selector in `{channel_url}`");
-                        continue;
-                    }
-                },
-                Err(e) => {
-                    warn!("Could not update `title` selector in `{channel_url}`: `{e}`");
-                    continue;
-                }
+            Some(ref selector) => match html.select(selector).next() {
+                Some(title) => title.inner_html(),
+                None => bail!("Could not scrape `title` selector from `{link}`"),
             },
             None => match channel_item.title {
                 Some(ref title) => title.clone(),
-                None => {
-                    warn!(
-                        "Could not assign `title` from channel item for content in `{channel_url}`"
-                    );
-                    continue;
-                }
+                None => bail!("Could not assign `title` from channel item for content in `{link}`"),
             },
         };
         let description = match channel_config.content_description_selector {
-            Some(ref selector) => match scrape(link, selector) {
-                Ok(value) => match value {
-                    Some(description) => description,
-                    None => {
-                        warn!("Could not scrape `description` selector in `{channel_url}`");
-                        continue;
-                    }
-                },
-                Err(e) => {
-                    warn!("Could not update `description` selector in `{channel_url}`: `{e}`");
-                    continue;
-                }
+            Some(ref selector) => match html.select(selector).next() {
+                Some(description) => description.inner_html(),
+                None => bail!("Could not scrape `description` selector from `{link}`"),
             },
             None => match channel_item.description {
                 Some(ref description) => description.clone(),
                 None => {
-                    warn!(
-                        "Could not assign `description` from channel item for content in `{channel_url}`"
-                    );
-                    continue;
+                    bail!("Could not assign `description` from channel item for `{link}`")
                 }
             },
         };
-        let _content_id = tx.insert_content(channel_item_id, None, &title, &description)?;
-        // @TODO preload media
+        let content_id = tx.insert_content(channel_item_id, None, &title, &description)?;
+        info!("Add new content record #{content_id} ({title})");
+        // persist images if enabled
+        if let Some(ref selector) = channel_config.persist_images_selector {
+            use sha2::{Digest, Sha256};
+            for element in scraper::Html::parse_document(&description).select(selector) {
+                if let Some(src) = element.value().attr("src") {
+                    // `src` may be relative or absolute: keep absolute values as is and
+                    // resolve relative ones against the item page URL (`base`)
+                    let absolute = match Url::parse(src) {
+                        Ok(url) => url,
+                        Err(url::ParseError::RelativeUrlWithoutBase) => {
+                            let absolute = base.join(src)?;
+                            debug!("Convert relative image link `{src}` to `{absolute}`");
+                            absolute
+                        }
+                        Err(e) => bail!("Could not parse URL from img source: `{e}`"),
+                    };
+                    let url = absolute.as_str();
+                    let data = get(url)?.bytes()?;
+                    // hex-encoded SHA-256 of the image data is its unique identity
+                    let hash = format!("{:x}", Sha256::digest(&data));
+                    // reuse the stored image when identical data was persisted before
+                    let image_id = match tx.image_id_by_sha256(&hash)? {
+                        Some(image_id) => image_id,
+                        None => {
+                            let image_id = tx.insert_image(&hash, Some(src), Some(url), &data)?;
+                            info!("Persist new image #{image_id} (`{absolute}`)");
+                            image_id
+                        }
+                    };
+                    let content_image_id = tx.insert_content_image(content_id, image_id)?;
+                    debug!("Add content image relationship #{content_image_id}")
+                }
+            }
+        }
     }
     Ok(())
 }
diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql
index 6c318f2..9524e12 100644
--- a/crates/mysql/database/0.1.0.sql
+++ b/crates/mysql/database/0.1.0.sql
@@ -1,5 +1,5 @@
 -- MySQL Script generated by MySQL Workbench
--- пт, 09-січ-2026 17:57:03 +0200
+-- сб, 10-січ-2026 14:27:50 +0200
 -- Model: New Model    Version: 1.0
 -- MySQL Workbench Forward Engineering
 
@@ -92,10 +92,12 @@ ENGINE = InnoDB;
 -- -----------------------------------------------------
 CREATE TABLE IF NOT EXISTS `rssto`.`image` (
   `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
-  `source` VARCHAR(2048) NOT NULL,
+  `sha256` CHAR(64) NOT NULL,
+  `src` VARCHAR(2048) NULL,
+  `url` VARCHAR(2048) NULL,
   `data` MEDIUMBLOB NOT NULL,
   PRIMARY KEY (`image_id`),
-  UNIQUE INDEX `source_UNIQUE` (`source` ASC) VISIBLE)
+  UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE)
 ENGINE = InnoDB;
 
 
diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs
index 3ee92ce..631bc37 100644
--- a/crates/mysql/src/table.rs
+++ b/crates/mysql/src/table.rs
@@ -37,7 +37,13 @@ pub struct Provider {
 #[derive(Debug, PartialEq, Eq, FromRow)]
 pub struct Image {
     pub image_id: u64,
-    pub source: String,
+    /// Keep image unique by comparing its data hash
+    pub sha256: String,
+    /// Original `src` attribute value, kept for later replacement in the content
+    pub src: Option<String>,
+    /// Resolved absolute URL
+    pub url: Option<String>,
+    /// Image data, MEDIUMBLOB (16M)
     pub data: Vec<u8>,
 }
 
diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs
index a39e290..919b56b 100644
--- a/crates/mysql/src/transaction.rs
+++ b/crates/mysql/src/transaction.rs
@@ -115,17 +115,23 @@ impl Transaction {
         Ok(self.tx.last_insert_id().unwrap())
     }
 
-    pub fn images_total_by_source(&mut self, source: &str) -> Result<usize, Error> {
-        Ok(self
-            .tx
-            .exec_first("SELECT COUNT(*) FROM `image` WHERE `source` = ?", (source,))?
-            .unwrap_or(0))
+    pub fn image_id_by_sha256(&mut self, sha256: &str) -> Result<Option<u64>, Error> {
+        self.tx.exec_first(
+            "SELECT `image_id` FROM `image` WHERE `sha256` = ? LIMIT 1",
+            (sha256,),
+        )
     }
 
-    pub fn insert_image(&mut self, source: &str, data: &[u8]) -> Result<u64, Error> {
+    pub fn insert_image(
+        &mut self,
+        sha256: &str,
+        src: Option<&str>,
+        url: Option<&str>,
+        data: &[u8],
+    ) -> Result<u64, Error> {
         self.tx.exec_drop(
-            "INSERT INTO `image` SET `source` = ?, `data` = ?",
-            (source, data),
+            "INSERT INTO `image` SET `sha256` = ?, `src` = ?, `url` = ?, `data` = ?",
+            (sha256, src, url, data),
         )?;
         Ok(self.tx.last_insert_id().unwrap())
     }