mirror of
https://github.com/YGGverse/rssto.git
synced 2026-03-31 17:15:29 +00:00
implement allowed_tags config option, format config, update documentation comments
This commit is contained in:
parent
89cd7cb9cf
commit
b34d7cdcdd
3 changed files with 110 additions and 54 deletions
|
|
@ -4,6 +4,7 @@ update = 900
|
||||||
# Database connection setup
|
# Database connection setup
|
||||||
# * see crates/mysql/database
|
# * see crates/mysql/database
|
||||||
[mysql]
|
[mysql]
|
||||||
|
|
||||||
host = "localhost"
|
host = "localhost"
|
||||||
port = 3306
|
port = 3306
|
||||||
username = ""
|
username = ""
|
||||||
|
|
@ -12,21 +13,62 @@ database = "rssto"
|
||||||
|
|
||||||
# Content sources (unlimited)
|
# Content sources (unlimited)
|
||||||
[[channel]]
|
[[channel]]
|
||||||
url = "https://"
|
|
||||||
|
# RSS feed source
|
||||||
|
url = "https://1"
|
||||||
|
|
||||||
|
# Limit latest channel items to crawl (unlimited by default)
|
||||||
items_limit = 20
|
items_limit = 20
|
||||||
|
|
||||||
|
# Save Channel item title in the database (currently not in use)
|
||||||
persist_item_title = true
|
persist_item_title = true
|
||||||
|
|
||||||
|
# Save Channel item description in the database (currently not in use)
|
||||||
persist_item_description = true
|
persist_item_description = true
|
||||||
# optional:
|
|
||||||
|
# Allowed tags
|
||||||
|
# * empty to strip all tags (default)
|
||||||
|
allowed_tags = []
|
||||||
|
|
||||||
|
# Scrape title by CSS selector
|
||||||
|
# * None to use Channel item title if exists or fail to continue
|
||||||
# content_title_selector = "h1"
|
# content_title_selector = "h1"
|
||||||
|
|
||||||
|
# Scrape description by CSS selector
|
||||||
|
# * None to use Channel item description if exists or fail to continue
|
||||||
# content_description_selector = "article"
|
# content_description_selector = "article"
|
||||||
|
|
||||||
|
# Preload content images locally if `Some`
|
||||||
|
# * currently stored in the database
|
||||||
# persist_images_selector = "img"
|
# persist_images_selector = "img"
|
||||||
|
|
||||||
|
|
||||||
[[channel]]
|
[[channel]]
|
||||||
url = "https://"
|
|
||||||
|
# RSS feed source
|
||||||
|
url = "https://2"
|
||||||
|
|
||||||
|
# Limit latest channel items to crawl (unlimited by default)
|
||||||
items_limit = 20
|
items_limit = 20
|
||||||
|
|
||||||
|
# Save Channel item title in the database (currently not in use)
|
||||||
persist_item_title = true
|
persist_item_title = true
|
||||||
|
|
||||||
|
# Save Channel item description in the database (currently not in use)
|
||||||
persist_item_description = true
|
persist_item_description = true
|
||||||
# optional:
|
|
||||||
|
# Allowed tags
|
||||||
|
# * empty to strip all tags (default)
|
||||||
|
allowed_tags = []
|
||||||
|
|
||||||
|
# Scrape title by CSS selector
|
||||||
|
# * None to use Channel item title if exists or fail to continue
|
||||||
# content_title_selector = "h1"
|
# content_title_selector = "h1"
|
||||||
|
|
||||||
|
# Scrape description by CSS selector
|
||||||
|
# * None to use Channel item description if exists or fail to continue
|
||||||
# content_description_selector = "article"
|
# content_description_selector = "article"
|
||||||
|
|
||||||
|
# Preload content images locally if `Some`
|
||||||
|
# * currently stored in the database
|
||||||
# persist_images_selector = "img"
|
# persist_images_selector = "img"
|
||||||
|
|
@ -15,18 +15,21 @@ pub struct Mysql {
|
||||||
pub struct Channel {
|
pub struct Channel {
|
||||||
/// RSS feed source
|
/// RSS feed source
|
||||||
pub url: Url,
|
pub url: Url,
|
||||||
/// Limit channel items (unlimited by default)
|
/// Limit latest channel items to crawl (unlimited by default)
|
||||||
pub items_limit: Option<usize>,
|
pub items_limit: Option<usize>,
|
||||||
/// Save item title
|
/// Save Channel item title in the database (currently not in use)
|
||||||
pub persist_item_title: bool,
|
pub persist_item_title: bool,
|
||||||
/// Save item description
|
/// Save Channel item description in the database (currently not in use)
|
||||||
pub persist_item_description: bool,
|
pub persist_item_description: bool,
|
||||||
/// Scrape title by CSS selector
|
/// Scrape title by CSS selector
|
||||||
/// * None to ignore
|
/// * None to use Channel item title if exists or fail to continue
|
||||||
pub content_title_selector: Option<Selector>,
|
pub content_title_selector: Option<Selector>,
|
||||||
/// Scrape description by CSS selector
|
/// Scrape description by CSS selector
|
||||||
/// * None to ignore
|
/// * None to use Channel item description if exists or fail to continue
|
||||||
pub content_description_selector: Option<Selector>,
|
pub content_description_selector: Option<Selector>,
|
||||||
|
/// Allowed tags
|
||||||
|
/// * empty to strip all tags (default)
|
||||||
|
pub allowed_tags: std::collections::HashSet<String>,
|
||||||
/// Preload content images locally if `Some`
|
/// Preload content images locally if `Some`
|
||||||
/// * currently stored in the database
|
/// * currently stored in the database
|
||||||
pub persist_images_selector: Option<Selector>,
|
pub persist_images_selector: Option<Selector>,
|
||||||
|
|
|
||||||
|
|
@ -60,11 +60,12 @@ fn main() -> Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
|
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
|
||||||
use ammonia::clean;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
fn strip_tags(html: &str) -> String {
|
/// Removes all tags from `html` except those listed in `allowed_tags`; strips every tag if `None`
|
||||||
|
fn strip_tags(html: &str, allowed_tags: Option<&HashSet<String>>) -> String {
|
||||||
ammonia::Builder::new()
|
ammonia::Builder::new()
|
||||||
.tags(std::collections::HashSet::new())
|
.tags(allowed_tags.map_or(HashSet::new(), |a| a.iter().map(|t| t.as_str()).collect()))
|
||||||
.clean(html)
|
.clean(html)
|
||||||
.to_string()
|
.to_string()
|
||||||
}
|
}
|
||||||
|
|
@ -113,12 +114,14 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
guid,
|
guid,
|
||||||
link,
|
link,
|
||||||
if channel_config.persist_item_title {
|
if channel_config.persist_item_title {
|
||||||
channel_item.title().map(strip_tags)
|
channel_item.title().map(|s| strip_tags(s, None))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
if channel_config.persist_item_description {
|
if channel_config.persist_item_description {
|
||||||
channel_item.description().map(clean)
|
channel_item
|
||||||
|
.description()
|
||||||
|
.map(|s| strip_tags(s, Some(&channel_config.allowed_tags)))
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
|
|
@ -126,7 +129,8 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
info!("Register new channel item #{channel_item_id} ({link})");
|
info!("Register new channel item #{channel_item_id} ({link})");
|
||||||
// preload remote content..
|
// preload remote content..
|
||||||
let html = scraper::Html::parse_document(&get(link)?.text()?);
|
let html = scraper::Html::parse_document(&get(link)?.text()?);
|
||||||
let description = clean(&match channel_config.content_description_selector {
|
let description = strip_tags(
|
||||||
|
&match channel_config.content_description_selector {
|
||||||
Some(ref selector) => match html.select(selector).next() {
|
Some(ref selector) => match html.select(selector).next() {
|
||||||
Some(description) => description.inner_html(),
|
Some(description) => description.inner_html(),
|
||||||
None => bail!("Could not scrape `description` selector from `{link}`"),
|
None => bail!("Could not scrape `description` selector from `{link}`"),
|
||||||
|
|
@ -137,11 +141,14 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
bail!("Could not assign `description` from channel item for `{link}`")
|
bail!("Could not assign `description` from channel item for `{link}`")
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
});
|
},
|
||||||
|
Some(&channel_config.allowed_tags),
|
||||||
|
);
|
||||||
let content_id = tx.insert_content(
|
let content_id = tx.insert_content(
|
||||||
channel_item_id,
|
channel_item_id,
|
||||||
None,
|
None,
|
||||||
strip_tags(&match channel_config.content_title_selector {
|
strip_tags(
|
||||||
|
&match channel_config.content_title_selector {
|
||||||
Some(ref selector) => match html.select(selector).next() {
|
Some(ref selector) => match html.select(selector).next() {
|
||||||
Some(title) => title.inner_html(),
|
Some(title) => title.inner_html(),
|
||||||
None => bail!("Could not scrape `title` selector from `{link}`"),
|
None => bail!("Could not scrape `title` selector from `{link}`"),
|
||||||
|
|
@ -149,12 +156,16 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
|
||||||
None => match channel_item.title {
|
None => match channel_item.title {
|
||||||
Some(ref title) => title.clone(),
|
Some(ref title) => title.clone(),
|
||||||
None => {
|
None => {
|
||||||
bail!("Could not assign `title` from channel item for content in `{link}`")
|
bail!(
|
||||||
|
"Could not assign `title` from channel item for content in `{link}`"
|
||||||
|
)
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
})
|
},
|
||||||
|
None,
|
||||||
|
)
|
||||||
.trim(),
|
.trim(),
|
||||||
clean(&description).trim(),
|
description.trim(),
|
||||||
)?;
|
)?;
|
||||||
info!("Add new content record #{content_id}");
|
info!("Add new content record #{content_id}");
|
||||||
// persist images if enabled
|
// persist images if enabled
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue