handle some 3rd-party errors, init scraper features

yggverse 2026-01-07 18:25:21 +02:00
parent 6bf89cbc3e
commit d8f2d723f5
4 changed files with 84 additions and 20 deletions

View file

@@ -17,7 +17,8 @@ log = "0.4.29"
 mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
 reqwest = { version = "0.13.1", features = ["blocking"] }
 rss = "2.0.12"
+scraper = { version = "0.25.0", features = ["serde"] }
 serde = { version = "1.0.228", features = ["derive"] }
 toml = "0.9.10"
 tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
 url = { version = "2.5.8", features = ["serde"] }
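
The new scraper dependency is pulled in with its "serde" feature so that CSS selectors can be deserialized straight from the TOML config (see the Channel struct below). A minimal sketch of that, assuming only the crates pinned above; the Probe struct is illustrative, not part of the repo:

use scraper::Selector;
use serde::Deserialize;

// Illustrative config fragment: one optional selector field,
// mirroring content_title_selector in the real Channel struct.
#[derive(Debug, Deserialize)]
struct Probe {
    content_title_selector: Option<Selector>,
}

fn main() {
    // With scraper's "serde" feature, an invalid selector fails here,
    // at config-parse time, rather than later at scrape time.
    let probe: Probe = toml::from_str(r#"content_title_selector = "h1""#).unwrap();
    assert!(probe.content_title_selector.is_some());
}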

View file

@@ -16,9 +16,15 @@ url = "https://"
 items_limit = 20
 persist_item_title = true
 persist_item_description = true
+# optional:
+# content_title_selector = "h1"
+# content_description_selector = "article"
 
 [[channel]]
 url = "https://"
 items_limit = 20
 persist_item_title = true
 persist_item_description = true
+# optional:
+# content_title_selector = "h1"
+# content_description_selector = "article"
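
For reference, a filled-in channel block might look like the following; the URL and selector values are placeholders, not part of the commit:

[[channel]]
url = "https://example.com/feed.xml"
items_limit = 20
persist_item_title = true
persist_item_description = true
# scrape full content from each linked page:
content_title_selector = "h1"
content_description_selector = "article"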

View file

@@ -1,3 +1,4 @@
+use scraper::Selector;
 use serde::Deserialize;
 use url::Url;
 
@@ -20,14 +21,19 @@ pub struct Channel {
     pub persist_item_title: bool,
     /// Save item description
     pub persist_item_description: bool,
+    /// Scrape title by CSS selector
+    /// * None to ignore
+    pub content_title_selector: Option<Selector>,
+    /// Scrape description by CSS selector
+    /// * None to ignore
+    pub content_description_selector: Option<Selector>,
 }
 
 #[derive(Debug, Deserialize)]
 pub struct Config {
     pub mysql: Mysql,
     pub channel: Vec<Channel>,
-    /// Update timeout in seconds
-    ///
+    /// Channels update timeout in seconds
     /// * None to generate once
     pub update: Option<u64>,
 }
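
Per the doc comment, `update` decides whether the crawler loops or generates once. A minimal sketch of that control flow, matching the loop in main.rs below; run_queue is a hypothetical stand-in for the per-channel crawl:

use std::{thread, time::Duration};

fn run_queue() { /* crawl every configured channel once */ }

fn main() {
    let update: Option<u64> = Some(300); // would come from Config::update
    match update {
        // Some(n): re-run the crawl queue every n seconds
        Some(seconds) => loop {
            run_queue();
            thread::sleep(Duration::from_secs(seconds));
        },
        // None: generate once and exit
        None => run_queue(),
    }
}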

View file

@@ -4,6 +4,7 @@ mod config;
 use anyhow::Result;
 use log::{debug, info, warn};
 use mysql::Mysql;
+use reqwest::blocking::get;
 
 fn main() -> Result<()> {
     use argument::Argument;
@@ -27,7 +28,6 @@ fn main() -> Result<()> {
     let argument = Argument::parse();
     let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
 
-
     let mut database = Mysql::connect(
         &config.mysql.host,
         config.mysql.port,
@@ -39,10 +39,10 @@ fn main() -> Result<()> {
     info!("Crawler started");
     loop {
         debug!("Begin new crawl queue...");
-        for feed in &config.channel {
-            debug!("Update `{}`...", feed.url);
-            if let Err(e) = crawl(&mut database, feed) {
-                warn!("Feed `{}` update failed: `{e}`", feed.url)
+        for c in &config.channel {
+            debug!("Update `{}`...", c.url);
+            if let Err(e) = crawl(&mut database, c) {
+                warn!("Channel `{}` update failed: `{e}`", c.url)
             }
         }
         debug!("Crawl queue completed");
@@ -56,31 +56,50 @@ fn main() -> Result<()> {
 }
 
 fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
-    use reqwest::blocking::get;
     use rss::Channel;
+    use scraper::Selector;
 
+    // shared local helpers
+    fn scrape(url: &str, selector: &Selector) -> Result<Option<String>> {
+        // fetch the linked page and return the first selector match, if any
+        let document = scraper::Html::parse_document(&get(url)?.text()?);
+        Ok(if let Some(first) = document.select(selector).next() {
+            Some(first.inner_html())
+        } else {
+            warn!("Could not scrape requested inner HTML");
+            None
+        })
+    }
+
+    // allocate once
+    let channel_url = channel_config.url.to_string();
+
+    let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
+        Ok(response) => response.into_items(),
+        Err(e) => {
+            warn!("Could not parse response from `{channel_url}`: `{e}`");
+            return Ok(());
+        }
+    };
-    let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?;
-    let channel_items = channel.items();
     let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
-    let feed_url = channel_config.url.to_string();
-    let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() {
+    let channel_id = match db.channels_by_url(&channel_url, Some(1))?.first() {
         Some(result) => result.channel_id,
-        None => db.insert_channel(&feed_url)?,
+        None => db.insert_channel(&channel_url)?,
     };
+
     for channel_item in channel_items.iter().take(channel_items_limit) {
         let guid = match channel_item.guid {
             Some(ref guid) => guid.value.clone(),
             None => {
-                warn!("Undefined `guid` field in `{feed_url}`");
+                warn!("Undefined `guid` field in `{channel_url}`");
                 continue;
             }
         };
         let link = match channel_item.link {
             Some(ref link) => link.clone(),
             None => {
-                warn!("Undefined `link` field in `{feed_url}`");
+                warn!("Undefined `link` field in `{channel_url}`");
                 continue;
             }
         };
@@ -88,12 +107,12 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
             Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
                 Ok(t) => t.timestamp(),
                 Err(e) => {
-                    warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`");
+                    warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`");
                     continue;
                 }
             },
             None => {
-                warn!("Undefined `pub_date` field in `{feed_url}`");
+                warn!("Undefined `pub_date` field in `{channel_url}`");
                 continue;
             }
         };
@@ -118,7 +137,39 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
                     None
                 },
             )?,
-        }; // @TODO
+        };
+
+        // @TODO preload remote content
+        let title = match channel_config.content_title_selector {
+            Some(ref selector) => match scrape(&link, selector) {
+                Ok(value) => value,
+                Err(e) => {
+                    warn!("Could not scrape `title` for `{channel_url}`: `{e}`");
+                    continue;
+                }
+            },
+            None => None,
+        };
+
+        let description = match channel_config.content_description_selector {
+            Some(ref selector) => match scrape(&link, selector) {
+                Ok(value) => value,
+                Err(e) => {
+                    warn!("Could not scrape `description` for `{channel_url}`: `{e}`");
+                    continue;
+                }
+            },
+            None => None,
+        };
+
+        if title.is_none() && description.is_none() {
+            continue;
+        }
+
+        // @TODO insert content record
+        println!("{:?}", description);
     }
 
     Ok(())
 }
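
The local scrape helper boils down to "parse the fetched page, return the inner HTML of the first node matching the selector". A standalone, offline sketch of that step, with static HTML in place of the live reqwest fetch; scrape_first is illustrative, not part of the repo:

use scraper::{Html, Selector};

fn scrape_first(html: &str, css: &str) -> Option<String> {
    // Parse the document and return the first match's inner HTML,
    // like crawl()'s scrape() above, minus the network call and logging.
    let selector = Selector::parse(css).ok()?;
    Html::parse_document(html)
        .select(&selector)
        .next()
        .map(|element| element.inner_html())
}

fn main() {
    let html = "<html><body><h1>Title</h1><article>Body</article></body></html>";
    assert_eq!(scrape_first(html, "h1").as_deref(), Some("Title"));
    assert_eq!(scrape_first(html, "article").as_deref(), Some("Body"));
    assert_eq!(scrape_first(html, "h2"), None);
}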