mirror of
https://github.com/YGGverse/rssto.git
synced 2026-03-31 17:15:29 +00:00
handle some 3rd-party errors, init scraper features
This commit is contained in:
parent
6bf89cbc3e
commit
d8f2d723f5
4 changed files with 84 additions and 20 deletions
|
|
@ -17,7 +17,8 @@ log = "0.4.29"
|
||||||
mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
|
mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
|
||||||
reqwest = { version = "0.13.1", features = ["blocking"] }
|
reqwest = { version = "0.13.1", features = ["blocking"] }
|
||||||
rss = "2.0.12"
|
rss = "2.0.12"
|
||||||
|
scraper = { version = "0.25.0", features = ["serde"] }
|
||||||
serde = { version = "1.0.228", features = ["derive"] }
|
serde = { version = "1.0.228", features = ["derive"] }
|
||||||
toml = "0.9.10"
|
toml = "0.9.10"
|
||||||
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
|
||||||
url = { version = "2.5.8", features = ["serde"] }
|
url = { version = "2.5.8", features = ["serde"] }
|
||||||
|
|
@ -16,9 +16,15 @@ url = "https://"
|
||||||
items_limit = 20
|
items_limit = 20
|
||||||
persist_item_title = true
|
persist_item_title = true
|
||||||
persist_item_description = true
|
persist_item_description = true
|
||||||
|
# optional:
|
||||||
|
# content_title_selector = "h1"
|
||||||
|
# content_description_selector = "article"
|
||||||
|
|
||||||
[[channel]]
|
[[channel]]
|
||||||
url = "https://"
|
url = "https://"
|
||||||
items_limit = 20
|
items_limit = 20
|
||||||
persist_item_title = true
|
persist_item_title = true
|
||||||
persist_item_description = true
|
persist_item_description = true
|
||||||
|
# optional:
|
||||||
|
# content_title_selector = "h1"
|
||||||
|
# content_description_selector = "article"
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
use scraper::Selector;
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
|
|
@ -20,14 +21,19 @@ pub struct Channel {
|
||||||
pub persist_item_title: bool,
|
pub persist_item_title: bool,
|
||||||
/// Save item description
|
/// Save item description
|
||||||
pub persist_item_description: bool,
|
pub persist_item_description: bool,
|
||||||
|
/// Scrape title by CSS selector
|
||||||
|
/// * None to ignore
|
||||||
|
pub content_title_selector: Option<Selector>,
|
||||||
|
/// Scrape description by CSS selector
|
||||||
|
/// * None to ignore
|
||||||
|
pub content_description_selector: Option<Selector>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
pub struct Config {
|
pub struct Config {
|
||||||
pub mysql: Mysql,
|
pub mysql: Mysql,
|
||||||
pub channel: Vec<Channel>,
|
pub channel: Vec<Channel>,
|
||||||
/// Update timeout in seconds
|
/// Channels update timeout in seconds
|
||||||
///
|
|
||||||
/// * None to generate once
|
/// * None to generate once
|
||||||
pub update: Option<u64>,
|
pub update: Option<u64>,
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ mod config;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use log::{debug, info, warn};
|
use log::{debug, info, warn};
|
||||||
use mysql::Mysql;
|
use mysql::Mysql;
|
||||||
|
use reqwest::blocking::get;
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
use argument::Argument;
|
use argument::Argument;
|
||||||
|
|
@ -27,7 +28,6 @@ fn main() -> Result<()> {
|
||||||
|
|
||||||
let argument = Argument::parse();
|
let argument = Argument::parse();
|
||||||
let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
|
let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
|
||||||
|
|
||||||
let mut database = Mysql::connect(
|
let mut database = Mysql::connect(
|
||||||
&config.mysql.host,
|
&config.mysql.host,
|
||||||
config.mysql.port,
|
config.mysql.port,
|
||||||
|
|
@ -39,10 +39,10 @@ fn main() -> Result<()> {
|
||||||
info!("Crawler started");
|
info!("Crawler started");
|
||||||
loop {
|
loop {
|
||||||
debug!("Begin new crawl queue...");
|
debug!("Begin new crawl queue...");
|
||||||
for feed in &config.channel {
|
for c in &config.channel {
|
||||||
debug!("Update `{}`...", feed.url);
|
debug!("Update `{}`...", c.url);
|
||||||
if let Err(e) = crawl(&mut database, feed) {
|
if let Err(e) = crawl(&mut database, c) {
|
||||||
warn!("Feed `{}` update failed: `{e}`", feed.url)
|
warn!("Channel `{}` update failed: `{e}`", c.url)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
debug!("Crawl queue completed");
|
debug!("Crawl queue completed");
|
||||||
|
|
@ -56,31 +56,50 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
|
fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
|
||||||
use reqwest::blocking::get;
|
|
||||||
use rss::Channel;
|
use rss::Channel;
|
||||||
|
use scraper::Selector;
|
||||||
|
|
||||||
|
// shared local helpers
|
||||||
|
fn scrape(url: &str, selector: &Selector) -> Result<Option<String>> {
|
||||||
|
let document = scraper::Html::parse_document(&get(url)?.text()?);
|
||||||
|
Ok(if let Some(first) = document.select(selector).next() {
|
||||||
|
Some(first.inner_html())
|
||||||
|
} else {
|
||||||
|
warn!("Could not scrape requested inner");
|
||||||
|
None
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate once
|
||||||
|
let channel_url = channel_config.url.to_string();
|
||||||
|
|
||||||
|
let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
|
||||||
|
Ok(response) => response.into_items(),
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Could not parse response from `{channel_url}`: `{e}`");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?;
|
|
||||||
let channel_items = channel.items();
|
|
||||||
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
|
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
|
||||||
|
|
||||||
let feed_url = channel_config.url.to_string();
|
let channel_id = match db.channels_by_url(&channel_url, Some(1))?.first() {
|
||||||
let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() {
|
|
||||||
Some(result) => result.channel_id,
|
Some(result) => result.channel_id,
|
||||||
None => db.insert_channel(&feed_url)?,
|
None => db.insert_channel(&channel_url)?,
|
||||||
};
|
};
|
||||||
|
|
||||||
for channel_item in channel_items.iter().take(channel_items_limit) {
|
for channel_item in channel_items.iter().take(channel_items_limit) {
|
||||||
let guid = match channel_item.guid {
|
let guid = match channel_item.guid {
|
||||||
Some(ref guid) => guid.value.clone(),
|
Some(ref guid) => guid.value.clone(),
|
||||||
None => {
|
None => {
|
||||||
warn!("Undefined `guid` field in `{feed_url}`");
|
warn!("Undefined `guid` field in `{channel_url}`");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let link = match channel_item.guid {
|
let link = match channel_item.guid {
|
||||||
Some(ref link) => link.value.clone(),
|
Some(ref link) => link.value.clone(),
|
||||||
None => {
|
None => {
|
||||||
warn!("Undefined `link` field in `{feed_url}`");
|
warn!("Undefined `link` field in `{channel_url}`");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -88,12 +107,12 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
|
||||||
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
|
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
|
||||||
Ok(t) => t.timestamp(),
|
Ok(t) => t.timestamp(),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`");
|
warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
None => {
|
None => {
|
||||||
warn!("Undefined `pub_date` field in `{feed_url}`");
|
warn!("Undefined `pub_date` field in `{channel_url}`");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -118,7 +137,39 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
)?,
|
)?,
|
||||||
}; // @TODO
|
};
|
||||||
|
|
||||||
|
// @TODO preload remote content
|
||||||
|
|
||||||
|
let title = match channel_config.content_title_selector {
|
||||||
|
Some(ref selector) => match scrape(&link, selector) {
|
||||||
|
Ok(value) => value,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Could not update `title` selector in `{channel_url}`: `{e}`");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let description = match channel_config.content_description_selector {
|
||||||
|
Some(ref selector) => match scrape(&link, selector) {
|
||||||
|
Ok(value) => value,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Could not update `description` selector in `{channel_url}`: `{e}`");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
|
if title.is_none() && description.is_none() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// @TODO insert content record
|
||||||
|
|
||||||
|
println!("{:?}", description)
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue