From ec0cca64f3b02b8606d17ddc72840af4e47b7e48 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 14:38:01 +0200 Subject: [PATCH] implement `persist_images_selector`, minimize codebase by using `bail`, change image table structure to use sha256 hash as the unique image identity --- crates/crawler/Cargo.toml | 1 + crates/crawler/config.toml | 4 +- crates/crawler/src/config.rs | 3 + crates/crawler/src/main.rs | 141 +++++++++++++++----------------- crates/mysql/database/0.1.0.sql | 8 +- crates/mysql/src/table.rs | 8 +- crates/mysql/src/transaction.rs | 22 +++-- 7 files changed, 97 insertions(+), 90 deletions(-) diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml index d531744..6e55b06 100644 --- a/crates/crawler/Cargo.toml +++ b/crates/crawler/Cargo.toml @@ -19,6 +19,7 @@ reqwest = { version = "0.13.1", features = ["blocking"] } rss = "2.0.12" scraper = { version = "0.25.0", features = ["serde"] } serde = { version = "1.0.228", features = ["derive"] } +sha2 = "0.10.9" toml = "0.9.10" tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } url = { version = "2.5.8", features = ["serde"] } \ No newline at end of file diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml index bde12ba..3232c16 100644 --- a/crates/crawler/config.toml +++ b/crates/crawler/config.toml @@ -19,6 +19,7 @@ persist_item_description = true # optional: # content_title_selector = "h1" # content_description_selector = "article" +# persist_images_selector = "img" [[channel]] url = "https://" @@ -27,4 +28,5 @@ persist_item_title = true persist_item_description = true # optional: # content_title_selector = "h1" -# content_description_selector = "article" \ No newline at end of file +# content_description_selector = "article" +# persist_images_selector = "img" \ No newline at end of file diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index dc325b5..701c6e4 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -27,6 +27,9 @@ pub struct Channel { /// Scrape description by CSS selector /// * None to ignore pub content_description_selector: Option, + /// Preload content images locally if `Some` + /// * currently stored in the database + pub persist_images_selector: Option, } #[derive(Debug, Deserialize)] diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 0d0867a..0767499 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -1,9 +1,10 @@ mod argument; mod config; -use anyhow::Result; +use anyhow::{Result, bail}; use log::{debug, info, warn}; use reqwest::blocking::get; +use url::Url; fn main() -> Result<()> { use chrono::Local; @@ -59,64 +60,40 @@ fn main() -> Result<()> { } fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> { - use rss::Channel; - use scraper::Selector; - - /// local helper - fn scrape(url: &str, selector: &Selector) -> Result> { - let document = scraper::Html::parse_document(&get(url)?.text()?); - Ok(if let Some(first) = document.select(selector).next() { - Some(first.inner_html()) - } else { - warn!("Could not scrape requested inner"); - None - }) - } - let channel_url = channel_config.url.to_string(); // allocate once - let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { - Ok(response) => response.into_items(), - Err(e) => { - warn!("Could not parse response from `{channel_url}`: `{e}`"); - return Ok(()); - } - }; + let channel_items = + match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { + Ok(response) => response.into_items(), + Err(e) => bail!("Could not parse response: `{e}`"), + }; let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); let channel_id = match tx.channel_id_by_url(&channel_url)? { Some(channel_id) => channel_id, - None => tx.insert_channel(&channel_url)?, + None => { + let channel_id = tx.insert_channel(&channel_url)?; + info!("Register new channel #{channel_id} ({channel_url})"); + channel_id + } }; for channel_item in channel_items.iter().take(channel_items_limit) { let guid = match channel_item.guid { Some(ref guid) => guid.value.as_ref(), - None => { - warn!("Undefined `guid` field in `{channel_url}`"); - continue; - } + None => bail!("Undefined `guid` field"), }; - let link = match channel_item.link { - Some(ref link) => link, - None => { - warn!("Undefined `link` field in `{channel_url}`"); - continue; - } + let (link, base) = match channel_item.link { + Some(ref link) => (link, Url::parse(link)?), + None => bail!("Undefined `link` field"), }; let pub_date = match channel_item.pub_date { Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) { Ok(t) => t.timestamp(), - Err(e) => { - warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`"); - continue; - } + Err(e) => bail!("Invalid `pub_date` field: `{e}`"), }, - None => { - warn!("Undefined `pub_date` field in `{channel_url}`"); - continue; - } + None => bail!("Undefined `pub_date`"), }; if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { continue; // skip next steps as processed @@ -137,57 +114,67 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul None }, )?; + info!("Register new channel item #{channel_item_id} ({link})"); // preload remote content.. + let html = scraper::Html::parse_document(&get(link)?.text()?); let title = match channel_config.content_title_selector { - Some(ref selector) => match scrape(link, selector) { - Ok(value) => match value { - Some(title) => title, - None => { - warn!("Could not scrape `title` selector in `{channel_url}`"); - continue; - } - }, - Err(e) => { - warn!("Could not update `title` selector in `{channel_url}`: `{e}`"); - continue; - } + Some(ref selector) => match html.select(selector).next() { + Some(title) => title.inner_html(), + None => bail!("Could not scrape `title` selector from `{link}`"), }, None => match channel_item.title { Some(ref title) => title.clone(), - None => { - warn!( - "Could not assign `title` from channel item for content in `{channel_url}`" - ); - continue; - } + None => bail!("Could not assign `title` from channel item for content in `{link}`"), }, }; let description = match channel_config.content_description_selector { - Some(ref selector) => match scrape(link, selector) { - Ok(value) => match value { - Some(description) => description, - None => { - warn!("Could not scrape `description` selector in `{channel_url}`"); - continue; - } - }, - Err(e) => { - warn!("Could not update `description` selector in `{channel_url}`: `{e}`"); - continue; - } + Some(ref selector) => match html.select(selector).next() { + Some(description) => description.inner_html(), + None => bail!("Could not scrape `description` selector from `{link}`"), }, None => match channel_item.description { Some(ref description) => description.clone(), None => { - warn!( - "Could not assign `description` from channel item for content in `{channel_url}`" - ); - continue; + bail!("Could not assign `description` from channel item for `{link}`") } }, }; - let _content_id = tx.insert_content(channel_item_id, None, &title, &description)?; - // @TODO preload media + let content_id = tx.insert_content(channel_item_id, None, &title, &description)?; + info!("Add new content record #{content_id} ({title})"); + // persist images if enabled + if let Some(ref selector) = channel_config.persist_images_selector { + use sha2::{Digest, Sha256}; + for element in scraper::Html::parse_document(&description).select(selector) { + if let Some(src) = element.value().attr("src") { + let absolute = match Url::parse(src) { + Ok(url) => url, + Err(e) => { + if e == url::ParseError::RelativeUrlWithoutBase { + let absolute = base.join(link)?; + debug!("Convert relative image link `{link}` to `{absolute}`"); + absolute + } else { + bail!("Could not parse URL from img source: `{e}`") + } + } + }; + let url = absolute.as_str(); + let data = get(url)?.bytes()?; + let hash = format!("{:x}", Sha256::digest(&data)); + + let image_id = match tx.image_id_by_sha256(&hash)? { + Some(image_id) => image_id, + None => { + let image_id = tx.insert_image(&hash, Some(src), Some(url), &data)?; + info!("Persist new image #{image_id} (`{absolute}`)"); + image_id + } + }; + let content_image_id = tx.insert_content_image(content_id, image_id)?; + debug!("Add content image relationship #{content_image_id}") + } + } + } } Ok(()) } diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index 6c318f2..9524e12 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- пт, 09-січ-2026 17:57:03 +0200 +-- сб, 10-січ-2026 14:27:50 +0200 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -92,10 +92,12 @@ ENGINE = InnoDB; -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`image` ( `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `source` VARCHAR(2048) NOT NULL, + `sha256` CHAR(64) NOT NULL, + `src` VARCHAR(2048) NULL, + `url` VARCHAR(2048) NULL, `data` MEDIUMBLOB NOT NULL, PRIMARY KEY (`image_id`), - UNIQUE INDEX `source_UNIQUE` (`source` ASC) VISIBLE) + UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE) ENGINE = InnoDB; diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs index 3ee92ce..631bc37 100644 --- a/crates/mysql/src/table.rs +++ b/crates/mysql/src/table.rs @@ -37,7 +37,13 @@ pub struct Provider { #[derive(Debug, PartialEq, Eq, FromRow)] pub struct Image { pub image_id: u64, - pub source: String, + /// Keep image unique by comparing its data hash + pub sha256: String, + /// Original `src` tag value to post-replacing + pub src: Option, + /// Resolved absolute URL + pub url: Option, + /// Image data, MEDIUMBLOB (16M) pub data: Vec, } diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs index a39e290..919b56b 100644 --- a/crates/mysql/src/transaction.rs +++ b/crates/mysql/src/transaction.rs @@ -115,17 +115,23 @@ impl Transaction { Ok(self.tx.last_insert_id().unwrap()) } - pub fn images_total_by_source(&mut self, source: &str) -> Result { - Ok(self - .tx - .exec_first("SELECT COUNT(*) FROM `image` WHERE `source` = ?", (source,))? - .unwrap_or(0)) + pub fn image_id_by_sha256(&mut self, sha256: &str) -> Result, Error> { + self.tx.exec_first( + "SELECT `image_id` FROM `image` WHERE `sha256` = ? LIMIT 1", + (sha256,), + ) } - pub fn insert_image(&mut self, source: &str, data: &[u8]) -> Result { + pub fn insert_image( + &mut self, + sha256: &str, + src: Option<&str>, + url: Option<&str>, + data: &[u8], + ) -> Result { self.tx.exec_drop( - "INSERT INTO `image` SET `source` = ?, `data` = ?", - (source, data), + "INSERT INTO `image` SET `sha256` = ?, `src` = ?, `url` = ?, `data` = ?", + (sha256, src, url, data), )?; Ok(self.tx.last_insert_id().unwrap()) }