mirror of
https://github.com/YGGverse/rssto.git
synced 2026-04-02 01:55:30 +00:00
implement persist_images_selector, minimize codebase by using bail, change image table structure to use sha256 hash as the unique image identity
This commit is contained in:
parent
bc61b5c09c
commit
ec0cca64f3
7 changed files with 97 additions and 90 deletions
|
|
@ -19,6 +19,7 @@ reqwest = { version = "0.13.1", features = ["blocking"] }
|
||||||
rss = "2.0.12"
|
rss = "2.0.12"
|
||||||
scraper = { version = "0.25.0", features = ["serde"] }
|
scraper = { version = "0.25.0", features = ["serde"] }
|
||||||
serde = { version = "1.0.228", features = ["derive"] }
|
serde = { version = "1.0.228", features = ["derive"] }
|
||||||
|
sha2 = "0.10.9"
|
||||||
toml = "0.9.10"
|
toml = "0.9.10"
|
||||||
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
|
||||||
url = { version = "2.5.8", features = ["serde"] }
|
url = { version = "2.5.8", features = ["serde"] }
|
||||||
|
|
@ -19,6 +19,7 @@ persist_item_description = true
|
||||||
# optional:
|
# optional:
|
||||||
# content_title_selector = "h1"
|
# content_title_selector = "h1"
|
||||||
# content_description_selector = "article"
|
# content_description_selector = "article"
|
||||||
|
# persist_images_selector = "img"
|
||||||
|
|
||||||
[[channel]]
|
[[channel]]
|
||||||
url = "https://"
|
url = "https://"
|
||||||
|
|
@ -28,3 +29,4 @@ persist_item_description = true
|
||||||
# optional:
|
# optional:
|
||||||
# content_title_selector = "h1"
|
# content_title_selector = "h1"
|
||||||
# content_description_selector = "article"
|
# content_description_selector = "article"
|
||||||
|
# persist_images_selector = "img"
|
||||||
|
|
@ -27,6 +27,9 @@ pub struct Channel {
|
||||||
/// Scrape description by CSS selector
|
/// Scrape description by CSS selector
|
||||||
/// * None to ignore
|
/// * None to ignore
|
||||||
pub content_description_selector: Option<Selector>,
|
pub content_description_selector: Option<Selector>,
|
||||||
|
/// Preload content images locally if `Some`
|
||||||
|
/// * currently stored in the database
|
||||||
|
pub persist_images_selector: Option<Selector>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,10 @@
|
||||||
mod argument;
|
mod argument;
|
||||||
mod config;
|
mod config;
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::{Result, bail};
|
||||||
use log::{debug, info, warn};
|
use log::{debug, info, warn};
|
||||||
use reqwest::blocking::get;
|
use reqwest::blocking::get;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
use chrono::Local;
|
use chrono::Local;
|
||||||
|
|
@ -59,64 +60,40 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
|
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
|
||||||
use rss::Channel;
|
|
||||||
use scraper::Selector;
|
|
||||||
|
|
||||||
/// local helper
|
|
||||||
fn scrape(url: &str, selector: &Selector) -> Result<Option<String>> {
|
|
||||||
let document = scraper::Html::parse_document(&get(url)?.text()?);
|
|
||||||
Ok(if let Some(first) = document.select(selector).next() {
|
|
||||||
Some(first.inner_html())
|
|
||||||
} else {
|
|
||||||
warn!("Could not scrape requested inner");
|
|
||||||
None
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
let channel_url = channel_config.url.to_string(); // allocate once
|
let channel_url = channel_config.url.to_string(); // allocate once
|
||||||
|
|
||||||
let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
|
let channel_items =
|
||||||
Ok(response) => response.into_items(),
|
match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
|
||||||
Err(e) => {
|
Ok(response) => response.into_items(),
|
||||||
warn!("Could not parse response from `{channel_url}`: `{e}`");
|
Err(e) => bail!("Could not parse response: `{e}`"),
|
||||||
return Ok(());
|
};
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
|
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
|
||||||
|
|
||||||
let channel_id = match tx.channel_id_by_url(&channel_url)? {
|
let channel_id = match tx.channel_id_by_url(&channel_url)? {
|
||||||
Some(channel_id) => channel_id,
|
Some(channel_id) => channel_id,
|
||||||
None => tx.insert_channel(&channel_url)?,
|
None => {
|
||||||
|
let channel_id = tx.insert_channel(&channel_url)?;
|
||||||
|
info!("Register new channel #{channel_id} ({channel_url})");
|
||||||
|
channel_id
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
for channel_item in channel_items.iter().take(channel_items_limit) {
|
for channel_item in channel_items.iter().take(channel_items_limit) {
|
||||||
let guid = match channel_item.guid {
|
let guid = match channel_item.guid {
|
||||||
Some(ref guid) => guid.value.as_ref(),
|
Some(ref guid) => guid.value.as_ref(),
|
||||||
None => {
|
None => bail!("Undefined `guid` field"),
|
||||||
warn!("Undefined `guid` field in `{channel_url}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
let link = match channel_item.link {
|
let (link, base) = match channel_item.link {
|
||||||
Some(ref link) => link,
|
Some(ref link) => (link, Url::parse(link)?),
|
||||||
None => {
|
None => bail!("Undefined `link` field"),
|
||||||
warn!("Undefined `link` field in `{channel_url}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
let pub_date = match channel_item.pub_date {
|
let pub_date = match channel_item.pub_date {
|
||||||
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
|
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
|
||||||
Ok(t) => t.timestamp(),
|
Ok(t) => t.timestamp(),
|
||||||
Err(e) => {
|
Err(e) => bail!("Invalid `pub_date` field: `{e}`"),
|
||||||
warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
None => {
|
None => bail!("Undefined `pub_date`"),
|
||||||
warn!("Undefined `pub_date` field in `{channel_url}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 {
|
if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 {
|
||||||
continue; // skip next steps as processed
|
continue; // skip next steps as processed
|
||||||
|
|
@ -137,57 +114,67 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
|
info!("Register new channel item #{channel_item_id} ({link})");
|
||||||
// preload remote content..
|
// preload remote content..
|
||||||
|
let html = scraper::Html::parse_document(&get(link)?.text()?);
|
||||||
let title = match channel_config.content_title_selector {
|
let title = match channel_config.content_title_selector {
|
||||||
Some(ref selector) => match scrape(link, selector) {
|
Some(ref selector) => match html.select(selector).next() {
|
||||||
Ok(value) => match value {
|
Some(title) => title.inner_html(),
|
||||||
Some(title) => title,
|
None => bail!("Could not scrape `title` selector from `{link}`"),
|
||||||
None => {
|
|
||||||
warn!("Could not scrape `title` selector in `{channel_url}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(e) => {
|
|
||||||
warn!("Could not update `title` selector in `{channel_url}`: `{e}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
None => match channel_item.title {
|
None => match channel_item.title {
|
||||||
Some(ref title) => title.clone(),
|
Some(ref title) => title.clone(),
|
||||||
None => {
|
None => bail!("Could not assign `title` from channel item for content in `{link}`"),
|
||||||
warn!(
|
|
||||||
"Could not assign `title` from channel item for content in `{channel_url}`"
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
let description = match channel_config.content_description_selector {
|
let description = match channel_config.content_description_selector {
|
||||||
Some(ref selector) => match scrape(link, selector) {
|
Some(ref selector) => match html.select(selector).next() {
|
||||||
Ok(value) => match value {
|
Some(description) => description.inner_html(),
|
||||||
Some(description) => description,
|
None => bail!("Could not scrape `description` selector from `{link}`"),
|
||||||
None => {
|
|
||||||
warn!("Could not scrape `description` selector in `{channel_url}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(e) => {
|
|
||||||
warn!("Could not update `description` selector in `{channel_url}`: `{e}`");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
None => match channel_item.description {
|
None => match channel_item.description {
|
||||||
Some(ref description) => description.clone(),
|
Some(ref description) => description.clone(),
|
||||||
None => {
|
None => {
|
||||||
warn!(
|
bail!("Could not assign `description` from channel item for `{link}`")
|
||||||
"Could not assign `description` from channel item for content in `{channel_url}`"
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
let _content_id = tx.insert_content(channel_item_id, None, &title, &description)?;
|
let content_id = tx.insert_content(channel_item_id, None, &title, &description)?;
|
||||||
// @TODO preload media
|
info!("Add new content record #{content_id} ({title})");
|
||||||
|
// persist images if enabled
|
||||||
|
if let Some(ref selector) = channel_config.persist_images_selector {
|
||||||
|
use sha2::{Digest, Sha256};
|
||||||
|
for element in scraper::Html::parse_document(&description).select(selector) {
|
||||||
|
if let Some(src) = element.value().attr("src") {
|
||||||
|
let absolute = match Url::parse(src) {
|
||||||
|
Ok(url) => url,
|
||||||
|
Err(e) => {
|
||||||
|
if e == url::ParseError::RelativeUrlWithoutBase {
|
||||||
|
let absolute = base.join(link)?;
|
||||||
|
debug!("Convert relative image link `{link}` to `{absolute}`");
|
||||||
|
absolute
|
||||||
|
} else {
|
||||||
|
bail!("Could not parse URL from img source: `{e}`")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let url = absolute.as_str();
|
||||||
|
let data = get(url)?.bytes()?;
|
||||||
|
let hash = format!("{:x}", Sha256::digest(&data));
|
||||||
|
|
||||||
|
let image_id = match tx.image_id_by_sha256(&hash)? {
|
||||||
|
Some(image_id) => image_id,
|
||||||
|
None => {
|
||||||
|
let image_id = tx.insert_image(&hash, Some(src), Some(url), &data)?;
|
||||||
|
info!("Persist new image #{image_id} (`{absolute}`)");
|
||||||
|
image_id
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let content_image_id = tx.insert_content_image(content_id, image_id)?;
|
||||||
|
debug!("Add content image relationship #{content_image_id}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
-- MySQL Script generated by MySQL Workbench
|
-- MySQL Script generated by MySQL Workbench
|
||||||
-- пт, 09-січ-2026 17:57:03 +0200
|
-- сб, 10-січ-2026 14:27:50 +0200
|
||||||
-- Model: New Model Version: 1.0
|
-- Model: New Model Version: 1.0
|
||||||
-- MySQL Workbench Forward Engineering
|
-- MySQL Workbench Forward Engineering
|
||||||
|
|
||||||
|
|
@ -92,10 +92,12 @@ ENGINE = InnoDB;
|
||||||
-- -----------------------------------------------------
|
-- -----------------------------------------------------
|
||||||
CREATE TABLE IF NOT EXISTS `rssto`.`image` (
|
CREATE TABLE IF NOT EXISTS `rssto`.`image` (
|
||||||
`image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
|
`image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
|
||||||
`source` VARCHAR(2048) NOT NULL,
|
`sha256` CHAR(64) NOT NULL,
|
||||||
|
`src` VARCHAR(2048) NULL,
|
||||||
|
`url` VARCHAR(2048) NULL,
|
||||||
`data` MEDIUMBLOB NOT NULL,
|
`data` MEDIUMBLOB NOT NULL,
|
||||||
PRIMARY KEY (`image_id`),
|
PRIMARY KEY (`image_id`),
|
||||||
UNIQUE INDEX `source_UNIQUE` (`source` ASC) VISIBLE)
|
UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE)
|
||||||
ENGINE = InnoDB;
|
ENGINE = InnoDB;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,13 @@ pub struct Provider {
|
||||||
#[derive(Debug, PartialEq, Eq, FromRow)]
|
#[derive(Debug, PartialEq, Eq, FromRow)]
|
||||||
pub struct Image {
|
pub struct Image {
|
||||||
pub image_id: u64,
|
pub image_id: u64,
|
||||||
pub source: String,
|
/// Keep image unique by comparing its data hash
|
||||||
|
pub sha256: String,
|
||||||
|
/// Original `src` attribute value, kept for later replacement
|
||||||
|
pub src: Option<String>,
|
||||||
|
/// Resolved absolute URL
|
||||||
|
pub url: Option<String>,
|
||||||
|
/// Image data, MEDIUMBLOB (16M)
|
||||||
pub data: Vec<u8>,
|
pub data: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -115,17 +115,23 @@ impl Transaction {
|
||||||
Ok(self.tx.last_insert_id().unwrap())
|
Ok(self.tx.last_insert_id().unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn images_total_by_source(&mut self, source: &str) -> Result<usize, Error> {
|
pub fn image_id_by_sha256(&mut self, sha256: &str) -> Result<Option<u64>, Error> {
|
||||||
Ok(self
|
self.tx.exec_first(
|
||||||
.tx
|
"SELECT `image_id` FROM `image` WHERE `sha256` = ? LIMIT 1",
|
||||||
.exec_first("SELECT COUNT(*) FROM `image` WHERE `source` = ?", (source,))?
|
(sha256,),
|
||||||
.unwrap_or(0))
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn insert_image(&mut self, source: &str, data: &[u8]) -> Result<u64, Error> {
|
pub fn insert_image(
|
||||||
|
&mut self,
|
||||||
|
sha256: &str,
|
||||||
|
src: Option<&str>,
|
||||||
|
url: Option<&str>,
|
||||||
|
data: &[u8],
|
||||||
|
) -> Result<u64, Error> {
|
||||||
self.tx.exec_drop(
|
self.tx.exec_drop(
|
||||||
"INSERT INTO `image` SET `source` = ?, `data` = ?",
|
"INSERT INTO `image` SET `sha256` = ?, `src` = ?, `url` = ?, `data` = ?",
|
||||||
(source, data),
|
(sha256, src, url, data),
|
||||||
)?;
|
)?;
|
||||||
Ok(self.tx.last_insert_id().unwrap())
|
Ok(self.tx.last_insert_id().unwrap())
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue