mirror of
https://github.com/YGGverse/rssto.git
synced 2026-04-02 10:05:32 +00:00
implement html sanitizing
This commit is contained in:
parent
843352bff2
commit
5570049588
3 changed files with 35 additions and 18 deletions
|
|
@ -10,6 +10,7 @@ categories = ["command-line-utilities", "parsing", "text-processing", "value-for
|
||||||
repository = "https://github.com/YGGverse/rssto"
|
repository = "https://github.com/YGGverse/rssto"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
ammonia = "4.1.2"
|
||||||
anyhow = "1.0.100"
|
anyhow = "1.0.100"
|
||||||
chrono = "0.4.42"
|
chrono = "0.4.42"
|
||||||
clap = { version = "4.5.54", features = ["derive"] }
|
clap = { version = "4.5.54", features = ["derive"] }
|
||||||
|
|
|
||||||
|
|
@ -60,6 +60,15 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
|
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
|
||||||
|
use ammonia::clean;
|
||||||
|
|
||||||
|
// Sanitize `html` with an `ammonia::Builder` whose set of allowed tags is
// empty, and return the cleaned result as an owned `String`.
//
// NOTE(review): presumably only the text content survives; the output is
// still produced by an HTML sanitizer, so characters such as `&` may come
// back entity-escaped -- confirm before treating the result as plain text.
fn strip_tags(html: &str) -> String {
|
||||||
|
ammonia::Builder::new()
|
||||||
|
.tags(std::collections::HashSet::new()) // empty allow-list: no tag is permitted
|
||||||
|
.clean(html)
|
||||||
|
.to_string()
|
||||||
|
}
|
||||||
|
|
||||||
let channel_url = channel_config.url.to_string(); // allocate once
|
let channel_url = channel_config.url.to_string(); // allocate once
|
||||||
|
|
||||||
let channel_items =
|
let channel_items =
|
||||||
|
|
@ -104,12 +113,12 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
guid,
|
guid,
|
||||||
link,
|
link,
|
||||||
if channel_config.persist_item_title {
|
if channel_config.persist_item_title {
|
||||||
channel_item.title()
|
channel_item.title().map(strip_tags)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
if channel_config.persist_item_description {
|
if channel_config.persist_item_description {
|
||||||
channel_item.description()
|
channel_item.description().map(clean)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
|
|
@ -117,17 +126,7 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
info!("Register new channel item #{channel_item_id} ({link})");
|
info!("Register new channel item #{channel_item_id} ({link})");
|
||||||
// preload remote content..
|
// preload remote content..
|
||||||
let html = scraper::Html::parse_document(&get(link)?.text()?);
|
let html = scraper::Html::parse_document(&get(link)?.text()?);
|
||||||
let title = match channel_config.content_title_selector {
|
let description = clean(&match channel_config.content_description_selector {
|
||||||
Some(ref selector) => match html.select(selector).next() {
|
|
||||||
Some(title) => title.inner_html(),
|
|
||||||
None => bail!("Could not scrape `title` selector from `{link}`"),
|
|
||||||
},
|
|
||||||
None => match channel_item.title {
|
|
||||||
Some(ref title) => title.clone(),
|
|
||||||
None => bail!("Could not assign `title` from channel item for content in `{link}`"),
|
|
||||||
},
|
|
||||||
};
|
|
||||||
let description = match channel_config.content_description_selector {
|
|
||||||
Some(ref selector) => match html.select(selector).next() {
|
Some(ref selector) => match html.select(selector).next() {
|
||||||
Some(description) => description.inner_html(),
|
Some(description) => description.inner_html(),
|
||||||
None => bail!("Could not scrape `description` selector from `{link}`"),
|
None => bail!("Could not scrape `description` selector from `{link}`"),
|
||||||
|
|
@ -138,9 +137,26 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
bail!("Could not assign `description` from channel item for `{link}`")
|
bail!("Could not assign `description` from channel item for `{link}`")
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
};
|
});
|
||||||
let content_id = tx.insert_content(channel_item_id, None, &title, &description)?;
|
let content_id = tx.insert_content(
|
||||||
info!("Add new content record #{content_id} ({title})");
|
channel_item_id,
|
||||||
|
None,
|
||||||
|
strip_tags(&match channel_config.content_title_selector {
|
||||||
|
Some(ref selector) => match html.select(selector).next() {
|
||||||
|
Some(title) => title.inner_html(),
|
||||||
|
None => bail!("Could not scrape `title` selector from `{link}`"),
|
||||||
|
},
|
||||||
|
None => match channel_item.title {
|
||||||
|
Some(ref title) => title.clone(),
|
||||||
|
None => {
|
||||||
|
bail!("Could not assign `title` from channel item for content in `{link}`")
|
||||||
|
}
|
||||||
|
},
|
||||||
|
})
|
||||||
|
.trim(),
|
||||||
|
clean(&description).trim(),
|
||||||
|
)?;
|
||||||
|
info!("Add new content record #{content_id}");
|
||||||
// persist images if enabled
|
// persist images if enabled
|
||||||
if let Some(ref selector) = channel_config.persist_images_selector {
|
if let Some(ref selector) = channel_config.persist_images_selector {
|
||||||
use sha2::{Digest, Sha256};
|
use sha2::{Digest, Sha256};
|
||||||
|
|
|
||||||
|
|
@ -56,8 +56,8 @@ impl Transaction {
|
||||||
pub_date: i64,
|
pub_date: i64,
|
||||||
guid: &str,
|
guid: &str,
|
||||||
link: &str,
|
link: &str,
|
||||||
title: Option<&str>,
|
title: Option<String>,
|
||||||
description: Option<&str>,
|
description: Option<String>,
|
||||||
) -> Result<u64, Error> {
|
) -> Result<u64, Error> {
|
||||||
self.tx.exec_drop(
|
self.tx.exec_drop(
|
||||||
"INSERT INTO `channel_item` SET `channel_id` = ?,
|
"INSERT INTO `channel_item` SET `channel_id` = ?,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue