diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml
index 148545a..11d6062 100644
--- a/crates/crawler/Cargo.toml
+++ b/crates/crawler/Cargo.toml
@@ -17,7 +17,8 @@ log = "0.4.29"
 mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
 reqwest = { version = "0.13.1", features = ["blocking"] }
 rss = "2.0.12"
+scraper = { version = "0.25.0", features = ["serde"] }
 serde = { version = "1.0.228", features = ["derive"] }
 toml = "0.9.10"
 tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
 url = { version = "2.5.8", features = ["serde"] }
diff --git a/crates/crawler/config/example.toml b/crates/crawler/config/example.toml
index fac36cf..bde12ba 100644
--- a/crates/crawler/config/example.toml
+++ b/crates/crawler/config/example.toml
@@ -16,9 +16,15 @@ url = "https://"
 items_limit = 20
 persist_item_title = true
 persist_item_description = true
+# optional:
+# content_title_selector = "h1"
+# content_description_selector = "article"
 
 [[channel]]
 url = "https://"
 items_limit = 20
 persist_item_title = true
-persist_item_description = true
\ No newline at end of file
+persist_item_description = true
+# optional:
+# content_title_selector = "h1"
+# content_description_selector = "article"
\ No newline at end of file
diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs
index e9ac4ac..dc325b5 100644
--- a/crates/crawler/src/config.rs
+++ b/crates/crawler/src/config.rs
@@ -1,3 +1,4 @@
+use scraper::Selector;
 use serde::Deserialize;
 use url::Url;
 
@@ -20,14 +21,19 @@ pub struct Channel {
     pub persist_item_title: bool,
     /// Save item description
     pub persist_item_description: bool,
+    /// Scrape title by CSS selector
+    /// * None to ignore
+    pub content_title_selector: Option<Selector>,
+    /// Scrape description by CSS selector
+    /// * None to ignore
+    pub content_description_selector: Option<Selector>,
 }
 
 #[derive(Debug, Deserialize)]
 pub struct Config {
     pub mysql: Mysql,
     pub channel: Vec<Channel>,
-    /// Update timeout in seconds
-    ///
+    /// Channels update timeout in seconds
     /// * None to generate once
     pub update: Option<u64>,
 }
diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs
index 01aa48e..7cdfc0b 100644
--- a/crates/crawler/src/main.rs
+++ b/crates/crawler/src/main.rs
@@ -4,6 +4,7 @@ mod config;
 use anyhow::Result;
 use log::{debug, info, warn};
 use mysql::Mysql;
+use reqwest::blocking::get;
 
 fn main() -> Result<()> {
     use argument::Argument;
@@ -27,7 +28,6 @@ fn main() -> Result<()> {
     let argument = Argument::parse();
 
     let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
-
     let mut database = Mysql::connect(
         &config.mysql.host,
         config.mysql.port,
@@ -39,10 +39,10 @@ fn main() -> Result<()> {
     info!("Crawler started");
     loop {
         debug!("Begin new crawl queue...");
-        for feed in &config.channel {
-            debug!("Update `{}`...", feed.url);
-            if let Err(e) = crawl(&mut database, feed) {
-                warn!("Feed `{}` update failed: `{e}`", feed.url)
+        for c in &config.channel {
+            debug!("Update `{}`...", c.url);
+            if let Err(e) = crawl(&mut database, c) {
+                warn!("Channel `{}` update failed: `{e}`", c.url)
             }
         }
         debug!("Crawl queue completed");
@@ -56,31 +56,50 @@ fn main() -> Result<()> {
 }
 
 fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
-    use reqwest::blocking::get;
     use rss::Channel;
+    use scraper::Selector;
+
+    // shared local helpers
+    fn scrape(url: &str, selector: &Selector) -> Result<Option<String>> {
+        let document = scraper::Html::parse_document(&get(url)?.text()?);
+        Ok(if let Some(first) = document.select(selector).next() {
+            Some(first.inner_html())
+        } else {
+            warn!("Could not scrape requested inner");
+            None
+        })
+    }
+
+    // allocate once
+    let channel_url = channel_config.url.to_string();
+
+    let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
+        Ok(response) => response.into_items(),
+        Err(e) => {
+            warn!("Could not parse response from `{channel_url}`: `{e}`");
+            return Ok(());
+        }
+    };
 
-    let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?;
-    let channel_items = channel.items();
     let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
 
-    let feed_url = channel_config.url.to_string();
-    let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() {
+    let channel_id = match db.channels_by_url(&channel_url, Some(1))?.first() {
         Some(result) => result.channel_id,
-        None => db.insert_channel(&feed_url)?,
+        None => db.insert_channel(&channel_url)?,
     };
 
     for channel_item in channel_items.iter().take(channel_items_limit) {
         let guid = match channel_item.guid {
             Some(ref guid) => guid.value.clone(),
             None => {
-                warn!("Undefined `guid` field in `{feed_url}`");
+                warn!("Undefined `guid` field in `{channel_url}`");
                 continue;
             }
         };
-        let link = match channel_item.guid {
-            Some(ref link) => link.value.clone(),
+        let link = match channel_item.link {
+            Some(ref link) => link.clone(),
             None => {
-                warn!("Undefined `link` field in `{feed_url}`");
+                warn!("Undefined `link` field in `{channel_url}`");
                 continue;
             }
         };
@@ -88,12 +107,12 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
             Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
                 Ok(t) => t.timestamp(),
                 Err(e) => {
-                    warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`");
+                    warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`");
                     continue;
                 }
             },
             None => {
-                warn!("Undefined `pub_date` field in `{feed_url}`");
+                warn!("Undefined `pub_date` field in `{channel_url}`");
                 continue;
             }
         };
@@ -118,7 +137,39 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
                     None
                 },
             )?,
-        }; // @TODO
+        };
+
+        // @TODO preload remote content
+
+        let title = match channel_config.content_title_selector {
+            Some(ref selector) => match scrape(&link, selector) {
+                Ok(value) => value,
+                Err(e) => {
+                    warn!("Could not update `title` selector in `{channel_url}`: `{e}`");
+                    continue;
+                }
+            },
+            None => None,
+        };
+
+        let description = match channel_config.content_description_selector {
+            Some(ref selector) => match scrape(&link, selector) {
+                Ok(value) => value,
+                Err(e) => {
+                    warn!("Could not update `description` selector in `{channel_url}`: `{e}`");
+                    continue;
+                }
+            },
+            None => None,
+        };
+
+        if title.is_none() && description.is_none() {
+            continue;
+        }
+
+        // @TODO insert content record
+
+        println!("{:?}", description)
     }
 
     Ok(())
 }