normalize db tables, optionally persist channel descriptions, remove entries logic from the crawler, update config options

This commit is contained in:
yggverse 2026-01-11 20:36:00 +02:00
parent 7e4d9e3ed6
commit 2b804d8915
10 changed files with 500 additions and 249 deletions

View file

@ -18,25 +18,28 @@ update = 900
url = "https://1" url = "https://1"
# Limit latest channel items to crawl (unlimited by default) # Limit latest channel items to crawl (unlimited by default)
items_limit = 20 items_limit = 5
# Save Channel item title in the database (currently not in use) # Save Channel `title` and `description` in the database (currently not in use)
persist_item_title = true persist_description = true
#Save Channel item description in the database (currently not in use) # Save Channel item `title` and `description` in the database
persist_item_description = true persist_item_description = true
# Allowed tags # Allowed tags
# * empty to strip all tags (default) # * empty to strip all tags (default)
allowed_tags = [] allowed_tags = ["a", "br", "p", "img"]
# Grab Channel item content (from the item `link`)
scrape_item_content = false
# Scrape title by CSS selector # Scrape title by CSS selector
# * None to use Channel item title if exists or fail to continue # * None to use Channel item title if exists or fail to continue
# content_title_selector = "h1" # scrape_item_content_title_selector = "h1"
# Scrape description by CSS selector # Scrape description by CSS selector
# * None to use Channel item description if exists or fail to continue # * None to use Channel item description if exists or fail to continue
# content_description_selector = "article" # scrape_item_content_description_selector = "article"
# Preload content images locally if `Some` # Preload content images locally if `Some`
# * currently stored in the database # * currently stored in the database
@ -49,25 +52,28 @@ update = 900
url = "https://2" url = "https://2"
# Limit latest channel items to crawl (unlimited by default) # Limit latest channel items to crawl (unlimited by default)
items_limit = 20 items_limit = 5
# Save Channel item title in the database (currently not in use) # Save Channel `title` and `description` in the database (currently not in use)
persist_item_title = true persist_description = true
#Save Channel item description in the database (currently not in use) # Save Channel item `title` and `description` in the database
persist_item_description = true persist_item_description = true
# Allowed tags # Allowed tags
# * empty to strip all tags (default) # * empty to strip all tags (default)
allowed_tags = [] allowed_tags = ["a", "br", "p", "img"]
# Grab Channel item content (from the item `link`)
scrape_item_content = false
# Scrape title by CSS selector # Scrape title by CSS selector
# * None to use Channel item title if exists or fail to continue # * None to use Channel item title if exists or fail to continue
# content_title_selector = "h1" # scrape_item_content_title_selector = "h1"
# Scrape description by CSS selector # Scrape description by CSS selector
# * None to use Channel item description if exists or fail to continue # * None to use Channel item description if exists or fail to continue
# content_description_selector = "article" # scrape_item_content_description_selector = "article"
# Preload content images locally if `Some` # Preload content images locally if `Some`
# * currently stored in the database # * currently stored in the database

View file

@ -17,16 +17,18 @@ pub struct Channel {
pub url: Url, pub url: Url,
/// Limit latest channel items to crawl (unlimited by default) /// Limit latest channel items to crawl (unlimited by default)
pub items_limit: Option<usize>, pub items_limit: Option<usize>,
/// Save Channel item title in the database (currently not in use) /// Save Channel title and description in the database
pub persist_item_title: bool, pub persist_description: bool,
/// Save Channel item description in the database (currently not in use) /// Save Channel item title and description in the database
pub persist_item_description: bool, pub persist_item_description: bool,
/// Grab Channel item content (from the item `link`)
pub scrape_item_content: bool,
/// Scrape title by CSS selector /// Scrape title by CSS selector
/// * None to use Channel item title if exists or fail to continue /// * None to use Channel item title if exists or fail to continue
pub content_title_selector: Option<Selector>, pub scrape_item_content_title_selector: Option<Selector>,
/// Scrape description by CSS selector /// Scrape description by CSS selector
/// * None to use Channel item description if exists or fail to continue /// * None to use Channel item description if exists or fail to continue
pub content_description_selector: Option<Selector>, pub scrape_item_content_description_selector: Option<Selector>,
/// Allowed tags /// Allowed tags
/// * empty to strip all tags (default) /// * empty to strip all tags (default)
pub allowed_tags: std::collections::HashSet<String>, pub allowed_tags: std::collections::HashSet<String>,

View file

@ -72,14 +72,6 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
let channel_url = channel_config.url.to_string(); // allocate once let channel_url = channel_config.url.to_string(); // allocate once
let channel_items =
match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
Ok(response) => response.into_items(),
Err(e) => bail!("Could not parse response: `{e}`"),
};
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
let channel_id = match tx.channel_id_by_url(&channel_url)? { let channel_id = match tx.channel_id_by_url(&channel_url)? {
Some(channel_id) => channel_id, Some(channel_id) => channel_id,
None => { None => {
@ -89,6 +81,28 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
} }
}; };
let channel_items =
match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
Ok(channel) => {
if channel_config.persist_description {
let channel_description_id = tx.insert_channel_description(
channel_id,
None,
Some(strip_tags(channel.title(), None)),
Some(strip_tags(
channel.description(),
Some(&channel_config.allowed_tags),
)),
)?;
debug!("Save channel description #{channel_description_id}")
}
channel.into_items()
}
Err(e) => bail!("Could not parse response: `{e}`"),
};
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
for channel_item in channel_items.iter().take(channel_items_limit) { for channel_item in channel_items.iter().take(channel_items_limit) {
let guid = match channel_item.guid { let guid = match channel_item.guid {
Some(ref guid) => guid.value.as_ref(), Some(ref guid) => guid.value.as_ref(),
@ -106,72 +120,62 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
None => bail!("Undefined `pub_date`"), None => bail!("Undefined `pub_date`"),
}; };
if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 {
debug!("Channel item `{guid}` already exists, skipped.");
continue; // skip next steps as processed continue; // skip next steps as processed
} }
let channel_item_id = tx.insert_channel_item( let channel_item_id = tx.insert_channel_item(channel_id, pub_date, guid, link)?;
channel_id,
pub_date,
guid,
link,
if channel_config.persist_item_title {
channel_item.title().map(|s| strip_tags(s, None))
} else {
None
},
if channel_config.persist_item_description {
channel_item
.description()
.map(|s| strip_tags(s, Some(&channel_config.allowed_tags)))
} else {
None
},
)?;
info!("Register new channel item #{channel_item_id} ({link})"); info!("Register new channel item #{channel_item_id} ({link})");
// preload remote content.. if channel_config.persist_item_description {
let html = scraper::Html::parse_document(&get(link)?.text()?); let channel_item_description_id = tx.insert_channel_item_description(
let description = strip_tags(
&match channel_config.content_description_selector {
Some(ref selector) => match html.select(selector).next() {
Some(description) => description.inner_html(),
None => bail!("Could not scrape `description` selector from `{link}`"),
},
None => match channel_item.description {
Some(ref description) => description.clone(),
None => {
bail!("Could not assign `description` from channel item for `{link}`")
}
},
},
Some(&channel_config.allowed_tags),
);
let content_id = tx.insert_content(
channel_item_id, channel_item_id,
None, None,
strip_tags( channel_item.title().map(|s| strip_tags(s, None)),
&match channel_config.content_title_selector { channel_item
.description()
.map(|s| strip_tags(s, Some(&channel_config.allowed_tags))),
)?;
debug!("Save channel item description #{channel_item_description_id}")
}
// preload remote content..
if !channel_config.scrape_item_content {
continue;
}
let channel_item_content_id = tx.insert_channel_item_content(channel_item_id)?;
info!("Add new content record #{channel_item_content_id}");
let html = scraper::Html::parse_document(&get(link)?.text()?);
let description = match channel_config.scrape_item_content_description_selector {
Some(ref selector) => match html.select(selector).next() { Some(ref selector) => match html.select(selector).next() {
Some(title) => title.inner_html(), Some(description) => Some(strip_tags(
&description.inner_html(),
Some(&channel_config.allowed_tags),
)),
None => bail!("Could not scrape `description` selector from `{link}`"),
},
None => None,
};
let channel_item_content_description_id = tx.insert_channel_item_content_description(
channel_item_content_id,
None,
match channel_config.scrape_item_content_title_selector {
Some(ref selector) => match html.select(selector).next() {
Some(title) => Some(strip_tags(&title.inner_html(), None)),
None => bail!("Could not scrape `title` selector from `{link}`"), None => bail!("Could not scrape `title` selector from `{link}`"),
}, },
None => match channel_item.title { None => None,
Some(ref title) => title.clone(),
None => {
bail!(
"Could not assign `title` from channel item for content in `{link}`"
)
} }
}, .as_ref()
}, .map(|s| s.trim()),
None, description.as_ref().map(|s| s.trim()),
)
.trim(),
description.trim(),
)?; )?;
info!("Add new content record #{content_id}"); debug!("Save channel item content description #{channel_item_content_description_id}");
// persist images if enabled // persist images if enabled
if let Some(ref selector) = channel_config.persist_images_selector { if let Some(ref selector) = channel_config.persist_images_selector {
use sha2::{Digest, Sha256}; use sha2::{Digest, Sha256};
for element in scraper::Html::parse_document(&description).select(selector) { if description.is_none() {
bail!("Field `description` is required to scrape images from `{link}`")
}
for element in scraper::Html::parse_document(&description.unwrap()).select(selector) {
if let Some(src) = element.value().attr("src") { if let Some(src) = element.value().attr("src") {
let absolute = match Url::parse(src) { let absolute = match Url::parse(src) {
Ok(url) => url, Ok(url) => url,
@ -197,10 +201,15 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
image_id image_id
} }
}; };
let content_image_id = tx.insert_content_image(content_id, image_id)?; let content_image_id =
tx.insert_content_image(channel_item_content_id, image_id)?;
debug!("Add content image relationship #{content_image_id}"); debug!("Add content image relationship #{content_image_id}");
let uri = format!("/image/{image_id}"); let uri = format!("/image/{image_id}");
tx.replace_content_description(content_id, src, &uri)?; tx.replace_channel_item_content_description(
channel_item_content_id,
src,
&uri,
)?;
debug!("Replace content image in description from `{src}` to `{uri}`") debug!("Replace content image in description from `{src}` to `{uri}`")
} }
} }

View file

@ -31,7 +31,7 @@ fn index(
#[derive(Serialize)] #[derive(Serialize)]
#[serde(crate = "rocket::serde")] #[serde(crate = "rocket::serde")]
struct Row { struct Row {
content_id: u64, channel_item_content_description_id: u64,
link: String, link: String,
time: String, time: String,
title: String, title: String,
@ -41,7 +41,7 @@ fn index(
Status::InternalServerError Status::InternalServerError
})?; })?;
let total = conn let total = conn
.contents_total_by_provider_id(global.provider_id, search) .channel_item_content_descriptions_total_by_provider_id(global.provider_id, search)
.map_err(|e| { .map_err(|e| {
error!("Could not get contents total: `{e}`"); error!("Could not get contents total: `{e}`");
Status::InternalServerError Status::InternalServerError
@ -73,7 +73,7 @@ fn index(
back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))),
next: if page.unwrap_or(1) * global.list_limit >= total { None } next: if page.unwrap_or(1) * global.list_limit >= total { None }
else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) },
rows: conn.contents_by_provider_id( rows: conn.channel_item_content_descriptions_by_provider_id(
global.provider_id, global.provider_id,
search, search,
Sort::Desc, Sort::Desc,
@ -84,13 +84,16 @@ fn index(
Status::InternalServerError Status::InternalServerError
})? })?
.into_iter() .into_iter()
.map(|content| { .map(|channel_item_content_description| {
let channel_item = conn.channel_item(content.channel_item_id).unwrap().unwrap(); let channel_item = conn.channel_item(
channel_item_content_description.channel_item_content_id
).unwrap().unwrap();
Row { Row {
content_id: content.content_id, channel_item_content_description_id:
channel_item_content_description.channel_item_content_description_id,
link: channel_item.link, link: channel_item.link,
time: time(channel_item.pub_date).format(&global.format_time).to_string(), time: time(channel_item.pub_date).format(&global.format_time).to_string(),
title: content.title, title: channel_item_content_description.title.unwrap_or_default(), // @TODO handle
} }
}) })
.collect::<Vec<Row>>(), .collect::<Vec<Row>>(),
@ -102,9 +105,9 @@ fn index(
)) ))
} }
#[get("/<content_id>")] #[get("/<channel_item_content_description_id>")]
fn info( fn info(
content_id: u64, channel_item_content_description_id: u64,
db: &State<Database>, db: &State<Database>,
meta: &State<Meta>, meta: &State<Meta>,
global: &State<Global>, global: &State<Global>,
@ -113,29 +116,52 @@ fn info(
error!("Could not connect database: `{e}`"); error!("Could not connect database: `{e}`");
Status::InternalServerError Status::InternalServerError
})?; })?;
match conn.content(content_id).map_err(|e| { match conn.channel_item_content_description(channel_item_content_description_id).map_err(|e| {
error!("Could not get content `{content_id}`: `{e}`"); error!("Could not get `channel_item_content_description_id` {channel_item_content_description_id}: `{e}`");
Status::InternalServerError Status::InternalServerError
})? { })? {
Some(content) => { Some(channel_item_content_description) => {
let channel_item = conn let channel_item_content = conn
.channel_item(content.channel_item_id) .channel_item_content(channel_item_content_description.channel_item_content_id)
.map_err(|e| { .map_err(|e| {
error!("Could not get requested channel item: `{e}`"); error!(
"Could not get requested `channel_item_content` #{}: `{e}`",
channel_item_content_description.channel_item_content_id
);
Status::InternalServerError Status::InternalServerError
})? })?
.ok_or_else(|| { .ok_or_else(|| {
error!("Could not find requested channel item"); error!(
"Could not find requested `channel_item_content` #{}",
channel_item_content_description.channel_item_content_id
);
Status::NotFound Status::NotFound
})?; })?;
let channel_item = conn
.channel_item(channel_item_content.channel_item_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item` #{}: `{e}`",
channel_item_content.channel_item_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item` #{}",
channel_item_content.channel_item_id
);
Status::NotFound
})?;
let title = channel_item_content_description.title.unwrap_or_default(); // @TODO handle
Ok(Template::render( Ok(Template::render(
"info", "info",
context! { context! {
description: content.description, description: channel_item_content_description.description,
link: channel_item.link, link: channel_item.link,
meta: meta.inner(), meta: meta.inner(),
title: format!("{}{S}{}", content.title, meta.title), title: format!("{title}{S}{}", meta.title),
name: content.title, name: title,
time: time(channel_item.pub_date).format(&global.format_time).to_string(), time: time(channel_item.pub_date).format(&global.format_time).to_string(),
}, },
)) ))
@ -175,8 +201,8 @@ fn rss(
error!("Could not connect database: `{e}`"); error!("Could not connect database: `{e}`");
Status::InternalServerError Status::InternalServerError
})?; })?;
for content in conn for channel_item_content_description in conn
.contents_by_provider_id( .channel_item_content_descriptions_by_provider_id(
global.provider_id, global.provider_id,
search, search,
Sort::Desc, Sort::Desc,
@ -184,26 +210,53 @@ fn rss(
Some(global.list_limit), Some(global.list_limit),
) )
.map_err(|e| { .map_err(|e| {
error!("Could not load channel item contents: `{e}`"); error!(
"Could not load `channel_item_content_description` for `provider` #{:?}: `{e}`",
global.provider_id
);
Status::InternalServerError Status::InternalServerError
})? })?
{ {
let channel_item = conn let channel_item_content = conn
.channel_item(content.channel_item_id) .channel_item_content(channel_item_content_description.channel_item_content_id)
.map_err(|e| { .map_err(|e| {
error!("Could not get requested channel item: `{e}`"); error!(
"Could not get requested `channel_item_content` #{}: `{e}`",
channel_item_content_description.channel_item_content_id
);
Status::InternalServerError Status::InternalServerError
})? })?
.ok_or_else(|| { .ok_or_else(|| {
error!("Could not find requested channel item"); error!(
"Could not find requested `channel_item_content` #{}",
channel_item_content_description.channel_item_content_id
);
Status::NotFound
})?;
let channel_item = conn
.channel_item(channel_item_content.channel_item_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item` #{}: `{e}`",
channel_item_content.channel_item_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item` #{}",
channel_item_content.channel_item_id
);
Status::NotFound Status::NotFound
})?; })?;
feed.push( feed.push(
content.channel_item_id, channel_item_content_description.channel_item_content_description_id,
time(channel_item.pub_date), time(channel_item.pub_date),
channel_item.link, channel_item.link,
content.title, channel_item_content_description.title.unwrap_or_default(), // @TODO handle
content.description, channel_item_content_description
.description
.unwrap_or_default(), // @TODO handle
) )
} }
Ok(RawXml(feed.commit())) Ok(RawXml(feed.commit()))

View file

@ -3,13 +3,15 @@
{% if rows %} {% if rows %}
{% for row in rows %} {% for row in rows %}
<div> <div>
<a name="{{ row.content_id }}"></a> <a name="{{ row.channel_item_content_description_id }}"></a>
<h2><a href="{{ row.content_id }}">{{ row.title }}</a></h2> <h2><a href="{{ row.channel_item_content_description_id }}">{{ row.title }}</a></h2>
<p>{{ row.time }}</p> <p>{{ row.time }}</p>
</div> </div>
{% endfor %} {% endfor %}
{% else %} {% else %}
<div>Nothing.</div> <div>
<p>Nothing.</p>
</div>
{% endif %} {% endif %}
{% if next %}<a href="{{ next }}">Next</a>{% endif %} {% if next %}<a href="{{ next }}">Next</a>{% endif %}
{% if back %}<a href="{{ back }}">Back</a>{% endif %} {% if back %}<a href="{{ back }}">Back</a>{% endif %}

View file

@ -67,35 +67,50 @@ async fn main() -> Result<()> {
loop { loop {
debug!("New queue begin..."); debug!("New queue begin...");
let mut tx = db.transaction()?; let mut tx = db.transaction()?;
for source in tx.contents_queue_for_provider_id(provider_id)? { for channel_item_content_description in
tx.channel_item_content_descriptions_queue_for_provider_id(provider_id)?
{
debug!( debug!(
"Begin generating `content_id` #{} using `provider_id` #{provider_id}.", "Begin generating `channel_item_content_description` #{} using `provider_id` #{provider_id}.",
source.content_id channel_item_content_description.channel_item_content_description_id
); );
let title = match channel_item_content_description.title {
let title = llm Some(subject) => Some(
.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, source.title)), Message::user(format!("{}\n{}", config.llm.message, subject)),
)) ))
.await?; .await?
.choices[0]
let description = llm .message
.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( .content
Message::user(format!("{}\n{}", config.llm.message, source.description)), .trim()
.to_string(),
),
None => None,
};
let description = match channel_item_content_description.description {
Some(subject) => Some(
llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, subject)),
)) ))
.await?; .await?
.choices[0]
let content_id = tx.insert_content( .message
source.channel_item_id, .content
.trim()
.to_string(),
),
None => None,
};
let channel_item_content_description_id = tx.insert_channel_item_content_description(
channel_item_content_description.channel_item_content_id,
Some(provider_id), Some(provider_id),
&title.choices[0].message.content, title.as_deref(),
&description.choices[0].message.content, description.as_deref(),
)?; )?;
info!(
debug!( "Create `channel_item_content_description` #{channel_item_content_description_id} by `provider_id` #{provider_id}."
"Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", );
source.content_id
)
} }
tx.commit()?; tx.commit()?;
debug!("Queue completed"); debug!("Queue completed");

View file

@ -1,5 +1,5 @@
-- MySQL Script generated by MySQL Workbench -- MySQL Script generated by MySQL Workbench
-- сб, 10-січ-2026 14:27:50 +0200 -- нд, 11-січ-2026 20:33:40 +0200
-- Model: New Model Version: 1.0 -- Model: New Model Version: 1.0
-- MySQL Workbench Forward Engineering -- MySQL Workbench Forward Engineering
@ -21,7 +21,7 @@ USE `rssto` ;
-- Table `rssto`.`channel` -- Table `rssto`.`channel`
-- ----------------------------------------------------- -- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel` ( CREATE TABLE IF NOT EXISTS `rssto`.`channel` (
`channel_id` INT NOT NULL AUTO_INCREMENT, `channel_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`url` VARCHAR(255) NOT NULL, `url` VARCHAR(255) NOT NULL,
PRIMARY KEY (`channel_id`), PRIMARY KEY (`channel_id`),
UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE) UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE)
@ -32,14 +32,12 @@ ENGINE = InnoDB;
-- Table `rssto`.`channel_item` -- Table `rssto`.`channel_item`
-- ----------------------------------------------------- -- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` ( CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` (
`channel_item_id` INT NOT NULL AUTO_INCREMENT, `channel_item_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT NOT NULL, `channel_id` INT UNSIGNED NOT NULL,
`pub_date` BIGINT NOT NULL, `pub_date` BIGINT NOT NULL,
`guid` VARCHAR(255) NOT NULL, `guid` VARCHAR(255) NOT NULL,
`link` VARCHAR(255) NOT NULL, `link` VARCHAR(255) NOT NULL,
`title` VARCHAR(255) NULL, PRIMARY KEY (`channel_item_id`, `channel_id`),
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_item_id`),
INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE, INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE, UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_channel` CONSTRAINT `fk_channel_item_channel`
@ -54,7 +52,7 @@ ENGINE = InnoDB;
-- Table `rssto`.`provider` -- Table `rssto`.`provider`
-- ----------------------------------------------------- -- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`provider` ( CREATE TABLE IF NOT EXISTS `rssto`.`provider` (
`provider_id` INT NOT NULL AUTO_INCREMENT, `provider_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`name` VARCHAR(255) NOT NULL, `name` VARCHAR(255) NOT NULL,
PRIMARY KEY (`provider_id`), PRIMARY KEY (`provider_id`),
UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE) UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE)
@ -62,27 +60,17 @@ ENGINE = InnoDB;
-- ----------------------------------------------------- -- -----------------------------------------------------
-- Table `rssto`.`content` -- Table `rssto`.`channel_item_content`
-- ----------------------------------------------------- -- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`content` ( CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content` (
`content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `channel_item_content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT NOT NULL, `channel_item_id` INT UNSIGNED NOT NULL,
`provider_id` INT NULL, PRIMARY KEY (`channel_item_content_id`, `channel_item_id`),
`title` VARCHAR(255) NOT NULL, INDEX `fk_channel_item_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
`description` LONGTEXT NOT NULL, CONSTRAINT `fk_channel_item_content_channel_item`
PRIMARY KEY (`content_id`),
INDEX `fk_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
INDEX `fk_content_provider_idx` (`provider_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_content_channel_item`
FOREIGN KEY (`channel_item_id`) FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`) REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_content_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION) ON UPDATE NO ACTION)
ENGINE = InnoDB; ENGINE = InnoDB;
@ -92,31 +80,38 @@ ENGINE = InnoDB;
-- ----------------------------------------------------- -- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`image` ( CREATE TABLE IF NOT EXISTS `rssto`.`image` (
`image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`provider_id` INT UNSIGNED NULL,
`sha256` CHAR(64) NOT NULL, `sha256` CHAR(64) NOT NULL,
`src` VARCHAR(2048) NULL, `src` VARCHAR(2048) NULL,
`url` VARCHAR(2048) NULL, `url` VARCHAR(2048) NULL,
`data` MEDIUMBLOB NOT NULL, `data` MEDIUMBLOB NOT NULL,
PRIMARY KEY (`image_id`), PRIMARY KEY (`image_id`),
UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE) UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE,
INDEX `fk_image_provider_idx` (`provider_id` ASC) VISIBLE,
CONSTRAINT `fk_image_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB; ENGINE = InnoDB;
-- ----------------------------------------------------- -- -----------------------------------------------------
-- Table `rssto`.`content_image` -- Table `rssto`.`channel_item_content_image`
-- ----------------------------------------------------- -- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`content_image` ( CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` (
`content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`content_id` BIGINT UNSIGNED NOT NULL, `content_channel_item_content_id` BIGINT UNSIGNED NOT NULL,
`image_id` BIGINT UNSIGNED NOT NULL, `image_id` BIGINT UNSIGNED NOT NULL,
PRIMARY KEY (`content_image_id`), PRIMARY KEY (`channel_item_content_image_id`),
INDEX `fk_content_image_content_idx` (`content_id` ASC) VISIBLE, INDEX `fk_channel_item_content_image_channel_item_content_idx` (`content_channel_item_content_id` ASC) VISIBLE,
INDEX `fk_content_image_image_idx` (`image_id` ASC) VISIBLE, INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE,
CONSTRAINT `fk_content_image_content` CONSTRAINT `fk_channel_item_content_image_channel_item_content`
FOREIGN KEY (`content_id`) FOREIGN KEY (`content_channel_item_content_id`)
REFERENCES `rssto`.`content` (`content_id`) REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION ON DELETE NO ACTION
ON UPDATE NO ACTION, ON UPDATE NO ACTION,
CONSTRAINT `fk_content_image_image` CONSTRAINT `fk_channel_item_content_image_image`
FOREIGN KEY (`image_id`) FOREIGN KEY (`image_id`)
REFERENCES `rssto`.`image` (`image_id`) REFERENCES `rssto`.`image` (`image_id`)
ON DELETE NO ACTION ON DELETE NO ACTION
@ -124,6 +119,84 @@ CREATE TABLE IF NOT EXISTS `rssto`.`content_image` (
ENGINE = InnoDB; ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_description`
-- -----------------------------------------------------
-- Stores optional `title` / `description` text for a channel, linked to
-- `channel` by `channel_id` and optionally to `provider` by `provider_id`.
-- NOTE(review): `provider_id` is nullable, and a MySQL UNIQUE index permits
-- multiple NULL values, so the (`channel_id`, `provider_id`) index below does
-- not limit how many rows with a NULL `provider_id` one channel may have.
-- Confirm this is intended.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_description` (
`channel_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL,
-- presumably NULL marks text that was not produced by any provider; verify against the writers
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_description_id`),
INDEX `fk_channel_description_provider_idx` (`provider_id` ASC) VISIBLE,
INDEX `fk_channel_description_channel_idx` (`channel_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`channel_id` ASC, `provider_id` ASC) VISIBLE,
-- Both foreign keys use NO ACTION: a referenced `provider` / `channel` row
-- cannot be deleted or re-keyed while rows here still point at it.
CONSTRAINT `fk_channel_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_description_channel`
FOREIGN KEY (`channel_id`)
REFERENCES `rssto`.`channel` (`channel_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_description`
-- -----------------------------------------------------
-- Stores optional `title` / `description` text for a channel item, linked to
-- `channel_item` by `channel_item_id` and optionally to `provider` by
-- `provider_id`.
-- NOTE(review): `provider_id` is nullable, and a MySQL UNIQUE index permits
-- multiple NULL values, so the (`channel_item_id`, `provider_id`) index below
-- does not limit how many rows with a NULL `provider_id` one item may have.
-- Confirm this is intended.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_description` (
`channel_item_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT UNSIGNED NOT NULL,
-- presumably NULL marks text that was not produced by any provider; verify against the writers
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
INDEX `fk_channel_item_description_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
INDEX `fk_channel_item_description_provider_idx` (`provider_id` ASC) VISIBLE,
PRIMARY KEY (`channel_item_description_id`),
UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE,
-- Both foreign keys use NO ACTION: a referenced `channel_item` / `provider`
-- row cannot be deleted or re-keyed while rows here still point at it.
CONSTRAINT `fk_channel_item_description_channel_item`
FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_content_description`
-- -----------------------------------------------------
-- Stores optional `title` / `description` text for a channel item content
-- record, linked to `channel_item_content` by `channel_item_content_id` and
-- optionally to `provider` by `provider_id`.
-- NOTE(review): `provider_id` is nullable, and a MySQL UNIQUE index permits
-- multiple NULL values, so the (`channel_item_content_id`, `provider_id`)
-- index below does not limit how many rows with a NULL `provider_id` one
-- content record may have. Confirm this is intended.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_description` (
`channel_item_content_description_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_content_id` BIGINT UNSIGNED NOT NULL,
-- presumably NULL marks text that was not produced by any provider; verify against the writers
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_item_content_description_id`),
INDEX `fk_channel_item_content_description_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE,
INDEX `fk_channel_item_content_description_provider_idx` (`provider_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`channel_item_content_id` ASC, `provider_id` ASC) VISIBLE,
-- Both foreign keys use NO ACTION: a referenced `channel_item_content` /
-- `provider` row cannot be deleted or re-keyed while rows here still point at it.
CONSTRAINT `fk_channel_item_content_description_channel_item_content`
FOREIGN KEY (`channel_item_content_id`)
REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_content_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
SET SQL_MODE=@OLD_SQL_MODE; SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;

View file

@ -19,54 +19,99 @@ impl Connection {
`channel_id`, `channel_id`,
`pub_date`, `pub_date`,
`guid`, `guid`,
`link`, `link` FROM `channel_item` WHERE `channel_item_id` = ?",
`title`,
`description` FROM `channel_item` WHERE `channel_item_id` = ?",
(channel_item_id,), (channel_item_id,),
) )
} }
pub fn content(&mut self, content_id: u64) -> Result<Option<Content>, Error> { pub fn channel_item_content(
&mut self,
channel_item_content_id: u64,
) -> Result<Option<ChannelItemContent>, Error> {
self.conn.exec_first( self.conn.exec_first(
"SELECT `content_id`, "SELECT `channel_item_content_id`,
`channel_item_id`, `channel_item_id`
`provider_id`, FROM `channel_item_content` WHERE `channel_item_content_id` = ?",
`title`, (channel_item_content_id,),
`description` FROM `content` WHERE `content_id` = ?",
(content_id,),
) )
} }
pub fn contents_total_by_provider_id( pub fn channel_item_content_description(
&mut self,
channel_item_content_description_id: u64,
) -> Result<Option<ChannelItemContentDescription>, Error> {
self.conn.exec_first(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description` FROM `channel_item_content_description`
WHERE `channel_item_content_description_id` = ?",
(channel_item_content_description_id,),
)
}
pub fn channel_item_content_descriptions_total_by_provider_id(
&mut self, &mut self,
provider_id: Option<u64>, provider_id: Option<u64>,
keyword: Option<&str>, keyword: Option<&str>,
) -> Result<usize, Error> { ) -> Result<usize, Error> {
let total: Option<usize> = self.conn.exec_first( let total: Option<usize> = match keyword {
"SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?", Some(k) => self.conn.exec_first(
(provider_id, like(keyword)), "SELECT COUNT(*) FROM `channel_item_content_description`
)?; WHERE `provider_id` <=> ? AND `title` LIKE '%?%'",
(provider_id, k),
)?,
None => self.conn.exec_first(
"SELECT COUNT(*) FROM `channel_item_content_description`
WHERE `provider_id` <=> ?",
(provider_id,),
)?,
};
Ok(total.unwrap_or(0)) Ok(total.unwrap_or(0))
} }
pub fn contents_by_provider_id( pub fn channel_item_content_descriptions_by_provider_id(
&mut self, &mut self,
provider_id: Option<u64>, provider_id: Option<u64>,
keyword: Option<&str>, keyword: Option<&str>,
sort: Sort, sort: Sort,
start: Option<usize>, start: Option<usize>,
limit: Option<usize>, limit: Option<usize>,
) -> Result<Vec<Content>, Error> { ) -> Result<Vec<ChannelItemContentDescription>, Error> {
self.conn.exec(format!( match keyword {
"SELECT `content_id`, Some(k) => self.conn.exec(
`channel_item_id`, format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`, `provider_id`,
`title`, `title`,
`description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {},{}", `description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ? AND `title` LIKE '%?%'
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0), start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT) limit.unwrap_or(DEFAULT_LIMIT)
), ),
(provider_id, like(keyword), )) (provider_id, k),
),
None => self.conn.exec(
format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ?
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id,),
),
}
} }
pub fn content_image(&mut self, content_image_id: u64) -> Result<Option<ContentImage>, Error> { pub fn content_image(&mut self, content_image_id: u64) -> Result<Option<ContentImage>, Error> {
@ -107,9 +152,4 @@ impl Connection {
} }
} }
/// Build the `LIKE` pattern shared by the search queries.
///
/// A keyword becomes a prefix pattern (`keyword%`); no keyword becomes the
/// match-everything pattern `%`.
fn like(value: Option<&str>) -> String {
    match value {
        Some(keyword) => format!("{keyword}%"),
        None => String::from("%"),
    }
}
const DEFAULT_LIMIT: usize = 100; const DEFAULT_LIMIT: usize = 100;

View file

@ -13,19 +13,30 @@ pub struct ChannelItem {
pub pub_date: i64, pub pub_date: i64,
pub guid: String, pub guid: String,
pub link: String, pub link: String,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemDescription {
pub channel_item_description_id: u64,
pub channel_item_id: u64,
pub provider_id: Option<u64>,
pub title: Option<String>, pub title: Option<String>,
pub description: Option<String>, pub description: Option<String>,
} }
#[derive(Debug, PartialEq, Eq, FromRow)] #[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Content { pub struct ChannelItemContent {
pub content_id: u64, pub channel_item_content_id: u64,
pub channel_item_id: u64, pub channel_item_id: u64,
/// None if the original `title` and `description` values }
/// parsed from the channel item on crawl
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemContentDescription {
pub channel_item_content_description_id: u64,
pub channel_item_content_id: u64,
pub provider_id: Option<u64>, pub provider_id: Option<u64>,
pub title: String, pub title: Option<String>,
pub description: String, pub description: Option<String>,
} }
#[derive(Debug, PartialEq, Eq, FromRow)] #[derive(Debug, PartialEq, Eq, FromRow)]
@ -37,6 +48,7 @@ pub struct Provider {
#[derive(Debug, PartialEq, Eq, FromRow)] #[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Image { pub struct Image {
pub image_id: u64, pub image_id: u64,
pub provider_id: Option<u64>,
/// Keep image unique by comparing its data hash /// Keep image unique by comparing its data hash
pub sha256: String, pub sha256: String,
/// Original `src` tag value to post-replacing /// Original `src` tag value to post-replacing

View file

@ -36,6 +36,23 @@ impl Transaction {
Ok(self.tx.last_insert_id().unwrap()) Ok(self.tx.last_insert_id().unwrap())
} }
/// Insert a `channel_description` row and return its auto-generated id.
///
/// # Errors
/// Propagates any error reported by the underlying transaction.
///
/// # Panics
/// Panics if the transaction reports no last insert id after the insert.
pub fn insert_channel_description(
    &mut self,
    channel_id: u64,
    provider_id: Option<u64>,
    title: Option<String>,
    description: Option<String>,
) -> Result<u64, Error> {
    const QUERY: &str = "INSERT INTO `channel_description` SET `channel_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?";
    let params = (channel_id, provider_id, title, description);
    self.tx.exec_drop(QUERY, params)?;
    let id = self.tx.last_insert_id().unwrap();
    Ok(id)
}
pub fn channel_items_total_by_channel_id_guid( pub fn channel_items_total_by_channel_id_guid(
&mut self, &mut self,
channel_id: u64, channel_id: u64,
@ -56,49 +73,26 @@ impl Transaction {
pub_date: i64, pub_date: i64,
guid: &str, guid: &str,
link: &str, link: &str,
title: Option<String>,
description: Option<String>,
) -> Result<u64, Error> { ) -> Result<u64, Error> {
self.tx.exec_drop( self.tx.exec_drop(
"INSERT INTO `channel_item` SET `channel_id` = ?, "INSERT INTO `channel_item` SET `channel_id` = ?,
`pub_date` = ?, `pub_date` = ?,
`guid` = ?, `guid` = ?,
`link` = ?, `link` = ?",
`title` = ?, (channel_id, pub_date, guid, link),
`description` = ?",
(channel_id, pub_date, guid, link, title, description),
)?; )?;
Ok(self.tx.last_insert_id().unwrap()) Ok(self.tx.last_insert_id().unwrap())
} }
pub fn contents_queue_for_provider_id( pub fn insert_channel_item_description(
&mut self,
provider_id: u64,
) -> Result<Vec<Content>, Error> {
self.tx.exec(
"SELECT `c1`.`content_id`,
`c1`.`channel_item_id`,
`c1`.`provider_id`,
`c1`.`title`,
`c1`.`description`
FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS (
SELECT NULL FROM `content` AS `c2`
WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id`
AND `c2`.`provider_id` = ? LIMIT 1
)",
(provider_id,),
)
}
pub fn insert_content(
&mut self, &mut self,
channel_item_id: u64, channel_item_id: u64,
provider_id: Option<u64>, provider_id: Option<u64>,
title: &str, title: Option<String>,
description: &str, description: Option<String>,
) -> Result<u64, Error> { ) -> Result<u64, Error> {
self.tx.exec_drop( self.tx.exec_drop(
"INSERT INTO `content` SET `channel_item_id` = ?, "INSERT INTO `channel_item_description` SET `channel_item_id` = ?,
`provider_id` = ?, `provider_id` = ?,
`title` = ?, `title` = ?,
`description` = ?", `description` = ?",
@ -107,15 +101,60 @@ impl Transaction {
Ok(self.tx.last_insert_id().unwrap()) Ok(self.tx.last_insert_id().unwrap())
} }
pub fn replace_content_description( pub fn channel_item_content_descriptions_queue_for_provider_id(
&mut self,
provider_id: u64,
) -> Result<Vec<ChannelItemContentDescription>, Error> {
self.tx.exec(
"SELECT `t1`.`content_id`,
`t1`.`channel_item_id`,
`t1`.`provider_id`,
`t1`.`title`,
`t1`.`description`
FROM `channel_item_content_description` AS `t1`
WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS (
SELECT NULL FROM `channel_item_content_description` AS `t2`
WHERE `t2`.`channel_item_id` = `t1`.`channel_item_id`
AND `t2`.`provider_id` = ? LIMIT 1
)",
(provider_id,),
)
}
/// Insert a `channel_item_content` row for the given channel item and
/// return its auto-generated id.
///
/// # Errors
/// Propagates any error reported by the underlying transaction.
///
/// # Panics
/// Panics if the transaction reports no last insert id after the insert.
pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result<u64, Error> {
    const QUERY: &str = "INSERT INTO `channel_item_content` SET `channel_item_id` = ?";
    self.tx.exec_drop(QUERY, (channel_item_id,))?;
    let id = self.tx.last_insert_id().unwrap();
    Ok(id)
}
/// Insert a `channel_item_content_description` row and return its
/// auto-generated id.
///
/// # Errors
/// Propagates any error reported by the underlying transaction.
///
/// # Panics
/// Panics if the transaction reports no last insert id after the insert.
pub fn insert_channel_item_content_description(
    &mut self,
    channel_item_content_id: u64,
    provider_id: Option<u64>,
    title: Option<&str>,
    description: Option<&str>,
) -> Result<u64, Error> {
    const QUERY: &str = "INSERT INTO `channel_item_content_description` SET `channel_item_content_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?";
    let params = (channel_item_content_id, provider_id, title, description);
    self.tx.exec_drop(QUERY, params)?;
    let id = self.tx.last_insert_id().unwrap();
    Ok(id)
}
pub fn replace_channel_item_content_description(
&mut self, &mut self,
content_id: u64, content_id: u64,
from: &str, from: &str,
to: &str, to: &str,
) -> Result<(), Error> { ) -> Result<(), Error> {
self.tx.exec_drop( self.tx.exec_drop(
"UPDATE `content` SET `description` = REPLACE(`description`, ?, ?) "UPDATE `channel_item_content_description`
WHERE`content_id` = ?", SET `description` = REPLACE(`description`, ?, ?) WHERE`content_id` = ?",
(from, to, content_id), (from, to, content_id),
) )
} }