Implement `persist_images_selector`, reduce the codebase by using `bail!`, and change the image table structure to use the SHA-256 hash as the unique image identity

This commit is contained in:
yggverse 2026-01-10 14:38:01 +02:00
parent bc61b5c09c
commit ec0cca64f3
7 changed files with 97 additions and 90 deletions

View file

@ -1,9 +1,10 @@
mod argument;
mod config;
use anyhow::Result;
use anyhow::{Result, bail};
use log::{debug, info, warn};
use reqwest::blocking::get;
use url::Url;
fn main() -> Result<()> {
use chrono::Local;
@ -59,64 +60,40 @@ fn main() -> Result<()> {
}
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
use rss::Channel;
use scraper::Selector;
/// Local helper: download `url`, parse the response body as an HTML document
/// and return the inner HTML of the first element matching `selector`.
///
/// Returns `Ok(None)` (after logging a warning) when nothing matches; request
/// and body-decoding failures are propagated as errors.
fn scrape(url: &str, selector: &Selector) -> Result<Option<String>> {
    let body = get(url)?.text()?;
    let document = scraper::Html::parse_document(&body);
    match document.select(selector).next() {
        Some(first) => Ok(Some(first.inner_html())),
        None => {
            warn!("Could not scrape requested inner");
            Ok(None)
        }
    }
}
let channel_url = channel_config.url.to_string(); // allocate once
let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
Ok(response) => response.into_items(),
Err(e) => {
warn!("Could not parse response from `{channel_url}`: `{e}`");
return Ok(());
}
};
let channel_items =
match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
Ok(response) => response.into_items(),
Err(e) => bail!("Could not parse response: `{e}`"),
};
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
let channel_id = match tx.channel_id_by_url(&channel_url)? {
Some(channel_id) => channel_id,
None => tx.insert_channel(&channel_url)?,
None => {
let channel_id = tx.insert_channel(&channel_url)?;
info!("Register new channel #{channel_id} ({channel_url})");
channel_id
}
};
for channel_item in channel_items.iter().take(channel_items_limit) {
let guid = match channel_item.guid {
Some(ref guid) => guid.value.as_ref(),
None => {
warn!("Undefined `guid` field in `{channel_url}`");
continue;
}
None => bail!("Undefined `guid` field"),
};
let link = match channel_item.link {
Some(ref link) => link,
None => {
warn!("Undefined `link` field in `{channel_url}`");
continue;
}
let (link, base) = match channel_item.link {
Some(ref link) => (link, Url::parse(link)?),
None => bail!("Undefined `link` field"),
};
let pub_date = match channel_item.pub_date {
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
Ok(t) => t.timestamp(),
Err(e) => {
warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`");
continue;
}
Err(e) => bail!("Invalid `pub_date` field: `{e}`"),
},
None => {
warn!("Undefined `pub_date` field in `{channel_url}`");
continue;
}
None => bail!("Undefined `pub_date`"),
};
if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 {
continue; // skip next steps as processed
@ -137,57 +114,67 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
None
},
)?;
info!("Register new channel item #{channel_item_id} ({link})");
// preload remote content..
let html = scraper::Html::parse_document(&get(link)?.text()?);
let title = match channel_config.content_title_selector {
Some(ref selector) => match scrape(link, selector) {
Ok(value) => match value {
Some(title) => title,
None => {
warn!("Could not scrape `title` selector in `{channel_url}`");
continue;
}
},
Err(e) => {
warn!("Could not update `title` selector in `{channel_url}`: `{e}`");
continue;
}
Some(ref selector) => match html.select(selector).next() {
Some(title) => title.inner_html(),
None => bail!("Could not scrape `title` selector from `{link}`"),
},
None => match channel_item.title {
Some(ref title) => title.clone(),
None => {
warn!(
"Could not assign `title` from channel item for content in `{channel_url}`"
);
continue;
}
None => bail!("Could not assign `title` from channel item for content in `{link}`"),
},
};
let description = match channel_config.content_description_selector {
Some(ref selector) => match scrape(link, selector) {
Ok(value) => match value {
Some(description) => description,
None => {
warn!("Could not scrape `description` selector in `{channel_url}`");
continue;
}
},
Err(e) => {
warn!("Could not update `description` selector in `{channel_url}`: `{e}`");
continue;
}
Some(ref selector) => match html.select(selector).next() {
Some(description) => description.inner_html(),
None => bail!("Could not scrape `description` selector from `{link}`"),
},
None => match channel_item.description {
Some(ref description) => description.clone(),
None => {
warn!(
"Could not assign `description` from channel item for content in `{channel_url}`"
);
continue;
bail!("Could not assign `description` from channel item for `{link}`")
}
},
};
let _content_id = tx.insert_content(channel_item_id, None, &title, &description)?;
// @TODO preload media
let content_id = tx.insert_content(channel_item_id, None, &title, &description)?;
info!("Add new content record #{content_id} ({title})");
// Persist images referenced by the content description, if a selector is
// configured for this channel. Each image is downloaded, identified by the
// SHA-256 hash of its bytes (so identical data is stored only once) and then
// linked to the content record.
if let Some(ref selector) = channel_config.persist_images_selector {
    use sha2::{Digest, Sha256};
    for element in scraper::Html::parse_document(&description).select(selector) {
        // matched elements without a `src` attribute are silently skipped
        if let Some(src) = element.value().attr("src") {
            let absolute = match Url::parse(src) {
                Ok(url) => url,
                // A relative source must be resolved against the item URL.
                // Join the image `src` itself: joining `link` onto its own
                // parsed form would just return the item page URL again.
                Err(url::ParseError::RelativeUrlWithoutBase) => {
                    let absolute = base.join(src)?;
                    debug!("Convert relative image link `{src}` to `{absolute}`");
                    absolute
                }
                Err(e) => bail!("Could not parse URL from img source: `{e}`"),
            };
            let url = absolute.as_str();
            let data = get(url)?.bytes()?;
            // lowercase hex digest is the unique image identity
            let hash = format!("{:x}", Sha256::digest(&data));
            let image_id = match tx.image_id_by_sha256(&hash)? {
                Some(image_id) => image_id,
                None => {
                    let image_id = tx.insert_image(&hash, Some(src), Some(url), &data)?;
                    info!("Persist new image #{image_id} (`{absolute}`)");
                    image_id
                }
            };
            let content_image_id = tx.insert_content_image(content_id, image_id)?;
            debug!("Add content image relationship #{content_image_id}");
        }
    }
}
}
Ok(())
}