implement allowed_tags config option, format config, update documentation comments

This commit is contained in:
yggverse 2026-01-10 22:48:45 +02:00
parent 89cd7cb9cf
commit b34d7cdcdd
3 changed files with 110 additions and 54 deletions

View file

@ -4,29 +4,71 @@ update = 900
# Database connection setup
# * see crates/mysql/database
[mysql]
host = "localhost"
port = 3306
username = ""
password = ""
database = "rssto"
host = "localhost"
port = 3306
username = ""
password = ""
database = "rssto"
# Content sources (unlimited)
[[channel]]
url = "https://"
items_limit = 20
persist_item_title = true
persist_item_description = true
# optional:
# content_title_selector = "h1"
# content_description_selector = "article"
# persist_images_selector = "img"
# RSS feed source
url = "https://1"
# Limit latest channel items to crawl (unlimited by default)
items_limit = 20
# Save Channel item title in the database (currently not in use)
persist_item_title = true
# Save Channel item description in the database (currently not in use)
persist_item_description = true
# Allowed tags
# * empty to strip all tags (default)
allowed_tags = []
# Scrape title by CSS selector
# * None to use the Channel item title if it exists, or fail otherwise
# content_title_selector = "h1"
# Scrape description by CSS selector
# * None to use the Channel item description if it exists, or fail otherwise
# content_description_selector = "article"
# Preload content images locally if `Some`
# * currently stored in the database
# persist_images_selector = "img"
[[channel]]
url = "https://"
items_limit = 20
persist_item_title = true
persist_item_description = true
# optional:
# content_title_selector = "h1"
# content_description_selector = "article"
# persist_images_selector = "img"
# RSS feed source
url = "https://2"
# Limit latest channel items to crawl (unlimited by default)
items_limit = 20
# Save Channel item title in the database (currently not in use)
persist_item_title = true
# Save Channel item description in the database (currently not in use)
persist_item_description = true
# Allowed tags
# * empty to strip all tags (default)
allowed_tags = []
# Scrape title by CSS selector
# * None to use the Channel item title if it exists, or fail otherwise
# content_title_selector = "h1"
# Scrape description by CSS selector
# * None to use the Channel item description if it exists, or fail otherwise
# content_description_selector = "article"
# Preload content images locally if `Some`
# * currently stored in the database
# persist_images_selector = "img"

View file

@ -15,18 +15,21 @@ pub struct Mysql {
pub struct Channel {
/// RSS feed source
pub url: Url,
/// Limit channel items (unlimited by default)
/// Limit latest channel items to crawl (unlimited by default)
pub items_limit: Option<usize>,
/// Save item title
/// Save Channel item title in the database (currently not in use)
pub persist_item_title: bool,
/// Save item description
/// Save Channel item description in the database (currently not in use)
pub persist_item_description: bool,
/// Scrape title by CSS selector
/// * None to ignore
/// * None to use the Channel item title if it exists, or fail otherwise
pub content_title_selector: Option<Selector>,
/// Scrape description by CSS selector
/// * None to ignore
/// * None to use the Channel item description if it exists, or fail otherwise
pub content_description_selector: Option<Selector>,
/// Allowed tags
/// * empty to strip all tags (default)
pub allowed_tags: std::collections::HashSet<String>,
/// Preload content images locally if `Some`
/// * currently stored in the database
pub persist_images_selector: Option<Selector>,

View file

@ -60,11 +60,12 @@ fn main() -> Result<()> {
}
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
use ammonia::clean;
use std::collections::HashSet;
fn strip_tags(html: &str) -> String {
/// Removes all tags from `html` except those in `allowed_tags`, or strips every tag if `None`
fn strip_tags(html: &str, allowed_tags: Option<&HashSet<String>>) -> String {
ammonia::Builder::new()
.tags(std::collections::HashSet::new())
.tags(allowed_tags.map_or(HashSet::new(), |a| a.iter().map(|t| t.as_str()).collect()))
.clean(html)
.to_string()
}
@ -113,12 +114,14 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
guid,
link,
if channel_config.persist_item_title {
channel_item.title().map(strip_tags)
channel_item.title().map(|s| strip_tags(s, None))
} else {
None
},
if channel_config.persist_item_description {
channel_item.description().map(clean)
channel_item
.description()
.map(|s| strip_tags(s, Some(&channel_config.allowed_tags)))
} else {
None
},
@ -126,35 +129,43 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
info!("Register new channel item #{channel_item_id} ({link})");
// preload remote content..
let html = scraper::Html::parse_document(&get(link)?.text()?);
let description = clean(&match channel_config.content_description_selector {
Some(ref selector) => match html.select(selector).next() {
Some(description) => description.inner_html(),
None => bail!("Could not scrape `description` selector from `{link}`"),
let description = strip_tags(
&match channel_config.content_description_selector {
Some(ref selector) => match html.select(selector).next() {
Some(description) => description.inner_html(),
None => bail!("Could not scrape `description` selector from `{link}`"),
},
None => match channel_item.description {
Some(ref description) => description.clone(),
None => {
bail!("Could not assign `description` from channel item for `{link}`")
}
},
},
None => match channel_item.description {
Some(ref description) => description.clone(),
None => {
bail!("Could not assign `description` from channel item for `{link}`")
}
},
});
Some(&channel_config.allowed_tags),
);
let content_id = tx.insert_content(
channel_item_id,
None,
strip_tags(&match channel_config.content_title_selector {
Some(ref selector) => match html.select(selector).next() {
Some(title) => title.inner_html(),
None => bail!("Could not scrape `title` selector from `{link}`"),
strip_tags(
&match channel_config.content_title_selector {
Some(ref selector) => match html.select(selector).next() {
Some(title) => title.inner_html(),
None => bail!("Could not scrape `title` selector from `{link}`"),
},
None => match channel_item.title {
Some(ref title) => title.clone(),
None => {
bail!(
"Could not assign `title` from channel item for content in `{link}`"
)
}
},
},
None => match channel_item.title {
Some(ref title) => title.clone(),
None => {
bail!("Could not assign `title` from channel item for content in `{link}`")
}
},
})
None,
)
.trim(),
clean(&description).trim(),
description.trim(),
)?;
info!("Add new content record #{content_id}");
// persist images if enabled