normalize db tables, optionally persist channel descriptions, remove entries logic from the crawler, update config options

This commit is contained in:
yggverse 2026-01-11 20:36:00 +02:00
parent 7e4d9e3ed6
commit 2b804d8915
10 changed files with 500 additions and 249 deletions

View file

@ -18,25 +18,28 @@ update = 900
url = "https://1"
# Limit latest channel items to crawl (unlimited by default)
items_limit = 20
items_limit = 5
# Save Channel item title in the database (currently not in use)
persist_item_title = true
# Save Channel `title` and `description` in the database (currently not in use)
persist_description = true
#Save Channel item description in the database (currently not in use)
# Save Channel item `title` and `description` in the database
persist_item_description = true
# Allowed tags
# * empty to strip all tags (default)
allowed_tags = []
allowed_tags = ["a", "br", "p", "img"]
# Grab Channel item content (from the item `link`)
scrape_item_content = false
# Scrape title by CSS selector
# * None to use Channel item title if exists or fail to continue
# content_title_selector = "h1"
# scrape_item_content_title_selector = "h1"
# Scrape description by CSS selector
# * None to use Channel item description if exists or fail to continue
# content_description_selector = "article"
# scrape_item_content_description_selector = "article"
# Preload content images locally if `Some`
# * currently stored in the database
@ -49,25 +52,28 @@ update = 900
url = "https://2"
# Limit latest channel items to crawl (unlimited by default)
items_limit = 20
items_limit = 5
# Save Channel item title in the database (currently not in use)
persist_item_title = true
# Save Channel `title` and `description` in the database (currently not in use)
persist_description = true
#Save Channel item description in the database (currently not in use)
# Save Channel item `title` and `description` in the database
persist_item_description = true
# Allowed tags
# * empty to strip all tags (default)
allowed_tags = []
allowed_tags = ["a", "br", "p", "img"]
# Grab Channel item content (from the item `link`)
scrape_item_content = false
# Scrape title by CSS selector
# * None to use Channel item title if exists or fail to continue
# content_title_selector = "h1"
# scrape_item_content_title_selector = "h1"
# Scrape description by CSS selector
# * None to use Channel item description if exists or fail to continue
# content_description_selector = "article"
# scrape_item_content_description_selector = "article"
# Preload content images locally if `Some`
# * currently stored in the database

View file

@ -17,16 +17,18 @@ pub struct Channel {
pub url: Url,
/// Limit latest channel items to crawl (unlimited by default)
pub items_limit: Option<usize>,
/// Save Channel item title in the database (currently not in use)
pub persist_item_title: bool,
/// Save Channel item description in the database (currently not in use)
/// Save Channel title and description in the database
pub persist_description: bool,
/// Save Channel item title and description in the database
pub persist_item_description: bool,
/// Grab Channel item content (from the item `link`)
pub scrape_item_content: bool,
/// Scrape title by CSS selector
/// * None to use Channel item title if exists or fail to continue
pub content_title_selector: Option<Selector>,
pub scrape_item_content_title_selector: Option<Selector>,
/// Scrape description by CSS selector
/// * None to use Channel item description if exists or fail to continue
pub content_description_selector: Option<Selector>,
pub scrape_item_content_description_selector: Option<Selector>,
/// Allowed tags
/// * empty to strip all tags (default)
pub allowed_tags: std::collections::HashSet<String>,

View file

@ -72,14 +72,6 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
let channel_url = channel_config.url.to_string(); // allocate once
let channel_items =
match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
Ok(response) => response.into_items(),
Err(e) => bail!("Could not parse response: `{e}`"),
};
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
let channel_id = match tx.channel_id_by_url(&channel_url)? {
Some(channel_id) => channel_id,
None => {
@ -89,6 +81,28 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
}
};
let channel_items =
match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
Ok(channel) => {
if channel_config.persist_description {
let channel_description_id = tx.insert_channel_description(
channel_id,
None,
Some(strip_tags(channel.title(), None)),
Some(strip_tags(
channel.description(),
Some(&channel_config.allowed_tags),
)),
)?;
debug!("Save channel description #{channel_description_id}")
}
channel.into_items()
}
Err(e) => bail!("Could not parse response: `{e}`"),
};
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
for channel_item in channel_items.iter().take(channel_items_limit) {
let guid = match channel_item.guid {
Some(ref guid) => guid.value.as_ref(),
@ -106,72 +120,62 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
None => bail!("Undefined `pub_date`"),
};
if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 {
debug!("Channel item `{guid}` already exists, skipped.");
continue; // skip next steps as processed
}
let channel_item_id = tx.insert_channel_item(
channel_id,
pub_date,
guid,
link,
if channel_config.persist_item_title {
channel_item.title().map(|s| strip_tags(s, None))
} else {
None
},
if channel_config.persist_item_description {
let channel_item_id = tx.insert_channel_item(channel_id, pub_date, guid, link)?;
info!("Register new channel item #{channel_item_id} ({link})");
if channel_config.persist_item_description {
let channel_item_description_id = tx.insert_channel_item_description(
channel_item_id,
None,
channel_item.title().map(|s| strip_tags(s, None)),
channel_item
.description()
.map(|s| strip_tags(s, Some(&channel_config.allowed_tags)))
} else {
None
},
)?;
info!("Register new channel item #{channel_item_id} ({link})");
.map(|s| strip_tags(s, Some(&channel_config.allowed_tags))),
)?;
debug!("Save channel item description #{channel_item_description_id}")
}
// preload remote content..
if !channel_config.scrape_item_content {
continue;
}
let channel_item_content_id = tx.insert_channel_item_content(channel_item_id)?;
info!("Add new content record #{channel_item_content_id}");
let html = scraper::Html::parse_document(&get(link)?.text()?);
let description = strip_tags(
&match channel_config.content_description_selector {
Some(ref selector) => match html.select(selector).next() {
Some(description) => description.inner_html(),
None => bail!("Could not scrape `description` selector from `{link}`"),
},
None => match channel_item.description {
Some(ref description) => description.clone(),
None => {
bail!("Could not assign `description` from channel item for `{link}`")
}
},
let description = match channel_config.scrape_item_content_description_selector {
Some(ref selector) => match html.select(selector).next() {
Some(description) => Some(strip_tags(
&description.inner_html(),
Some(&channel_config.allowed_tags),
)),
None => bail!("Could not scrape `description` selector from `{link}`"),
},
Some(&channel_config.allowed_tags),
);
let content_id = tx.insert_content(
channel_item_id,
None => None,
};
let channel_item_content_description_id = tx.insert_channel_item_content_description(
channel_item_content_id,
None,
strip_tags(
&match channel_config.content_title_selector {
Some(ref selector) => match html.select(selector).next() {
Some(title) => title.inner_html(),
None => bail!("Could not scrape `title` selector from `{link}`"),
},
None => match channel_item.title {
Some(ref title) => title.clone(),
None => {
bail!(
"Could not assign `title` from channel item for content in `{link}`"
)
}
},
match channel_config.scrape_item_content_title_selector {
Some(ref selector) => match html.select(selector).next() {
Some(title) => Some(strip_tags(&title.inner_html(), None)),
None => bail!("Could not scrape `title` selector from `{link}`"),
},
None,
)
.trim(),
description.trim(),
None => None,
}
.as_ref()
.map(|s| s.trim()),
description.as_ref().map(|s| s.trim()),
)?;
info!("Add new content record #{content_id}");
debug!("Save channel item content description #{channel_item_content_description_id}");
// persist images if enabled
if let Some(ref selector) = channel_config.persist_images_selector {
use sha2::{Digest, Sha256};
for element in scraper::Html::parse_document(&description).select(selector) {
if description.is_none() {
bail!("Field `description` is required to scrape images from `{link}`")
}
for element in scraper::Html::parse_document(&description.unwrap()).select(selector) {
if let Some(src) = element.value().attr("src") {
let absolute = match Url::parse(src) {
Ok(url) => url,
@ -197,10 +201,15 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul
image_id
}
};
let content_image_id = tx.insert_content_image(content_id, image_id)?;
let content_image_id =
tx.insert_content_image(channel_item_content_id, image_id)?;
debug!("Add content image relationship #{content_image_id}");
let uri = format!("/image/{image_id}");
tx.replace_content_description(content_id, src, &uri)?;
tx.replace_channel_item_content_description(
channel_item_content_id,
src,
&uri,
)?;
debug!("Replace content image in description from `{src}` to `{uri}`")
}
}

View file

@ -31,7 +31,7 @@ fn index(
#[derive(Serialize)]
#[serde(crate = "rocket::serde")]
struct Row {
content_id: u64,
channel_item_content_description_id: u64,
link: String,
time: String,
title: String,
@ -41,7 +41,7 @@ fn index(
Status::InternalServerError
})?;
let total = conn
.contents_total_by_provider_id(global.provider_id, search)
.channel_item_content_descriptions_total_by_provider_id(global.provider_id, search)
.map_err(|e| {
error!("Could not get contents total: `{e}`");
Status::InternalServerError
@ -73,7 +73,7 @@ fn index(
back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))),
next: if page.unwrap_or(1) * global.list_limit >= total { None }
else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) },
rows: conn.contents_by_provider_id(
rows: conn.channel_item_content_descriptions_by_provider_id(
global.provider_id,
search,
Sort::Desc,
@ -84,13 +84,16 @@ fn index(
Status::InternalServerError
})?
.into_iter()
.map(|content| {
let channel_item = conn.channel_item(content.channel_item_id).unwrap().unwrap();
.map(|channel_item_content_description| {
let channel_item = conn.channel_item(
channel_item_content_description.channel_item_content_id
).unwrap().unwrap();
Row {
content_id: content.content_id,
channel_item_content_description_id:
channel_item_content_description.channel_item_content_description_id,
link: channel_item.link,
time: time(channel_item.pub_date).format(&global.format_time).to_string(),
title: content.title,
title: channel_item_content_description.title.unwrap_or_default(), // @TODO handle
}
})
.collect::<Vec<Row>>(),
@ -102,9 +105,9 @@ fn index(
))
}
#[get("/<content_id>")]
#[get("/<channel_item_content_description_id>")]
fn info(
content_id: u64,
channel_item_content_description_id: u64,
db: &State<Database>,
meta: &State<Meta>,
global: &State<Global>,
@ -113,29 +116,52 @@ fn info(
error!("Could not connect database: `{e}`");
Status::InternalServerError
})?;
match conn.content(content_id).map_err(|e| {
error!("Could not get content `{content_id}`: `{e}`");
match conn.channel_item_content_description(channel_item_content_description_id).map_err(|e| {
error!("Could not get `channel_item_content_description_id` {channel_item_content_description_id}: `{e}`");
Status::InternalServerError
})? {
Some(content) => {
let channel_item = conn
.channel_item(content.channel_item_id)
Some(channel_item_content_description) => {
let channel_item_content = conn
.channel_item_content(channel_item_content_description.channel_item_content_id)
.map_err(|e| {
error!("Could not get requested channel item: `{e}`");
error!(
"Could not get requested `channel_item_content` #{}: `{e}`",
channel_item_content_description.channel_item_content_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!("Could not find requested channel item");
error!(
"Could not find requested `channel_item_content` #{}",
channel_item_content_description.channel_item_content_id
);
Status::NotFound
})?;
let channel_item = conn
.channel_item(channel_item_content.channel_item_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item` #{}: `{e}`",
channel_item_content.channel_item_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item` #{}",
channel_item_content.channel_item_id
);
Status::NotFound
})?;
let title = channel_item_content_description.title.unwrap_or_default(); // @TODO handle
Ok(Template::render(
"info",
context! {
description: content.description,
description: channel_item_content_description.description,
link: channel_item.link,
meta: meta.inner(),
title: format!("{}{S}{}", content.title, meta.title),
name: content.title,
title: format!("{title}{S}{}", meta.title),
name: title,
time: time(channel_item.pub_date).format(&global.format_time).to_string(),
},
))
@ -175,8 +201,8 @@ fn rss(
error!("Could not connect database: `{e}`");
Status::InternalServerError
})?;
for content in conn
.contents_by_provider_id(
for channel_item_content_description in conn
.channel_item_content_descriptions_by_provider_id(
global.provider_id,
search,
Sort::Desc,
@ -184,26 +210,53 @@ fn rss(
Some(global.list_limit),
)
.map_err(|e| {
error!("Could not load channel item contents: `{e}`");
error!(
"Could not load `channel_item_content_description` for `provider` #{:?}: `{e}`",
global.provider_id
);
Status::InternalServerError
})?
{
let channel_item = conn
.channel_item(content.channel_item_id)
let channel_item_content = conn
.channel_item_content(channel_item_content_description.channel_item_content_id)
.map_err(|e| {
error!("Could not get requested channel item: `{e}`");
error!(
"Could not get requested `channel_item_content` #{}: `{e}`",
channel_item_content_description.channel_item_content_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!("Could not find requested channel item");
error!(
"Could not find requested `channel_item_content` #{}",
channel_item_content_description.channel_item_content_id
);
Status::NotFound
})?;
let channel_item = conn
.channel_item(channel_item_content.channel_item_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item` #{}: `{e}`",
channel_item_content.channel_item_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item` #{}",
channel_item_content.channel_item_id
);
Status::NotFound
})?;
feed.push(
content.channel_item_id,
channel_item_content_description.channel_item_content_description_id,
time(channel_item.pub_date),
channel_item.link,
content.title,
content.description,
channel_item_content_description.title.unwrap_or_default(), // @TODO handle
channel_item_content_description
.description
.unwrap_or_default(), // @TODO handle
)
}
Ok(RawXml(feed.commit()))

View file

@ -3,13 +3,15 @@
{% if rows %}
{% for row in rows %}
<div>
<a name="{{ row.content_id }}"></a>
<h2><a href="{{ row.content_id }}">{{ row.title }}</a></h2>
<a name="{{ row.channel_item_content_description_id }}"></a>
<h2><a href="{{ row.channel_item_content_description_id }}">{{ row.title }}</a></h2>
<p>{{ row.time }}</p>
</div>
{% endfor %}
{% else %}
<div>Nothing.</div>
<div>
<p>Nothing.</p>
</div>
{% endif %}
{% if next %}<a href="{{ next }}">Next</a>{% endif %}
{% if back %}<a href="{{ back }}">Back</a>{% endif %}

View file

@ -67,35 +67,50 @@ async fn main() -> Result<()> {
loop {
debug!("New queue begin...");
let mut tx = db.transaction()?;
for source in tx.contents_queue_for_provider_id(provider_id)? {
for channel_item_content_description in
tx.channel_item_content_descriptions_queue_for_provider_id(provider_id)?
{
debug!(
"Begin generating `content_id` #{} using `provider_id` #{provider_id}.",
source.content_id
"Begin generating `channel_item_content_description` #{} using `provider_id` #{provider_id}.",
channel_item_content_description.channel_item_content_description_id
);
let title = llm
.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, source.title)),
))
.await?;
let description = llm
.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, source.description)),
))
.await?;
let content_id = tx.insert_content(
source.channel_item_id,
let title = match channel_item_content_description.title {
Some(subject) => Some(
llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, subject)),
))
.await?
.choices[0]
.message
.content
.trim()
.to_string(),
),
None => None,
};
let description = match channel_item_content_description.description {
Some(subject) => Some(
llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, subject)),
))
.await?
.choices[0]
.message
.content
.trim()
.to_string(),
),
None => None,
};
let channel_item_content_description_id = tx.insert_channel_item_content_description(
channel_item_content_description.channel_item_content_id,
Some(provider_id),
&title.choices[0].message.content,
&description.choices[0].message.content,
title.as_deref(),
description.as_deref(),
)?;
debug!(
"Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.",
source.content_id
)
info!(
"Create `channel_item_content_description` #{channel_item_content_description_id} by `provider_id` #{provider_id}."
);
}
tx.commit()?;
debug!("Queue completed");

View file

@ -1,5 +1,5 @@
-- MySQL Script generated by MySQL Workbench
-- сб, 10-січ-2026 14:27:50 +0200
-- нд, 11-січ-2026 20:33:40 +0200
-- Model: New Model Version: 1.0
-- MySQL Workbench Forward Engineering
@ -21,7 +21,7 @@ USE `rssto` ;
-- Table `rssto`.`channel`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel` (
`channel_id` INT NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`url` VARCHAR(255) NOT NULL,
PRIMARY KEY (`channel_id`),
UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE)
@ -32,14 +32,12 @@ ENGINE = InnoDB;
-- Table `rssto`.`channel_item`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` (
`channel_item_id` INT NOT NULL AUTO_INCREMENT,
`channel_id` INT NOT NULL,
`channel_item_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL,
`pub_date` BIGINT NOT NULL,
`guid` VARCHAR(255) NOT NULL,
`link` VARCHAR(255) NOT NULL,
`title` VARCHAR(255) NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_item_id`),
PRIMARY KEY (`channel_item_id`, `channel_id`),
INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_channel`
@ -54,7 +52,7 @@ ENGINE = InnoDB;
-- Table `rssto`.`provider`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`provider` (
`provider_id` INT NOT NULL AUTO_INCREMENT,
`provider_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`name` VARCHAR(255) NOT NULL,
PRIMARY KEY (`provider_id`),
UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE)
@ -62,27 +60,17 @@ ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`content`
-- Table `rssto`.`channel_item_content`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`content` (
`content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT NOT NULL,
`provider_id` INT NULL,
`title` VARCHAR(255) NOT NULL,
`description` LONGTEXT NOT NULL,
PRIMARY KEY (`content_id`),
INDEX `fk_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
INDEX `fk_content_provider_idx` (`provider_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_content_channel_item`
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content` (
`channel_item_content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT UNSIGNED NOT NULL,
PRIMARY KEY (`channel_item_content_id`, `channel_item_id`),
INDEX `fk_channel_item_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_channel_item`
FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_content_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
@ -92,31 +80,38 @@ ENGINE = InnoDB;
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`image` (
`image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`provider_id` INT UNSIGNED NULL,
`sha256` CHAR(64) NOT NULL,
`src` VARCHAR(2048) NULL,
`url` VARCHAR(2048) NULL,
`data` MEDIUMBLOB NOT NULL,
PRIMARY KEY (`image_id`),
UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE)
UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE,
INDEX `fk_image_provider_idx` (`provider_id` ASC) VISIBLE,
CONSTRAINT `fk_image_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`content_image`
-- Table `rssto`.`channel_item_content_image`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`content_image` (
`content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`content_id` BIGINT UNSIGNED NOT NULL,
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` (
`channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`content_channel_item_content_id` BIGINT UNSIGNED NOT NULL,
`image_id` BIGINT UNSIGNED NOT NULL,
PRIMARY KEY (`content_image_id`),
INDEX `fk_content_image_content_idx` (`content_id` ASC) VISIBLE,
INDEX `fk_content_image_image_idx` (`image_id` ASC) VISIBLE,
CONSTRAINT `fk_content_image_content`
FOREIGN KEY (`content_id`)
REFERENCES `rssto`.`content` (`content_id`)
PRIMARY KEY (`channel_item_content_image_id`),
INDEX `fk_channel_item_content_image_channel_item_content_idx` (`content_channel_item_content_id` ASC) VISIBLE,
INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_image_channel_item_content`
FOREIGN KEY (`content_channel_item_content_id`)
REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_content_image_image`
CONSTRAINT `fk_channel_item_content_image_image`
FOREIGN KEY (`image_id`)
REFERENCES `rssto`.`image` (`image_id`)
ON DELETE NO ACTION
@ -124,6 +119,84 @@ CREATE TABLE IF NOT EXISTS `rssto`.`content_image` (
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_description`
-- -----------------------------------------------------
-- Title/description text stored for a channel.
-- `title` and `description` are both nullable, so a row may carry either or neither.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_description` (
`channel_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL,
-- Nullable: the crawler inserts its rows without a provider (NULL).
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_description_id`),
INDEX `fk_channel_description_provider_idx` (`provider_id` ASC) VISIBLE,
INDEX `fk_channel_description_channel_idx` (`channel_id` ASC) VISIBLE,
-- NOTE(review): a MySQL UNIQUE index accepts multiple NULL values, so this index
-- does not limit a channel to one row with NULL `provider_id` - confirm that
-- repeated inserts of provider-less rows are intended.
UNIQUE INDEX `UNIQUE` (`channel_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_description_channel`
FOREIGN KEY (`channel_id`)
REFERENCES `rssto`.`channel` (`channel_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_description`
-- -----------------------------------------------------
-- Title/description text stored for a channel item.
-- `title` and `description` are both nullable, so a row may carry either or neither.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_description` (
`channel_item_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT UNSIGNED NOT NULL,
-- Nullable: the crawler inserts its rows without a provider (NULL).
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
INDEX `fk_channel_item_description_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
INDEX `fk_channel_item_description_provider_idx` (`provider_id` ASC) VISIBLE,
PRIMARY KEY (`channel_item_description_id`),
-- NOTE(review): a MySQL UNIQUE index accepts multiple NULL values, so this index
-- does not limit a channel item to one row with NULL `provider_id` - confirm
-- that this is intended.
UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_description_channel_item`
FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_content_description`
-- -----------------------------------------------------
-- Title/description text stored for a `channel_item_content` record.
-- `title` and `description` are both nullable, so a row may carry either or neither.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_description` (
`channel_item_content_description_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_content_id` BIGINT UNSIGNED NOT NULL,
-- Nullable: the crawler inserts its rows without a provider (NULL), while the
-- LLM queue inserts rows carrying the generating provider's id.
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_item_content_description_id`),
INDEX `fk_channel_item_content_description_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE,
INDEX `fk_channel_item_content_description_provider_idx` (`provider_id` ASC) VISIBLE,
-- NOTE(review): a MySQL UNIQUE index accepts multiple NULL values, so this index
-- only deduplicates rows with a non-NULL `provider_id` - confirm that this is
-- intended.
UNIQUE INDEX `UNIQUE` (`channel_item_content_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_description_channel_item_content`
FOREIGN KEY (`channel_item_content_id`)
REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_content_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;

View file

@ -19,54 +19,99 @@ impl Connection {
`channel_id`,
`pub_date`,
`guid`,
`link`,
`title`,
`description` FROM `channel_item` WHERE `channel_item_id` = ?",
`link` FROM `channel_item` WHERE `channel_item_id` = ?",
(channel_item_id,),
)
}
pub fn content(&mut self, content_id: u64) -> Result<Option<Content>, Error> {
pub fn channel_item_content(
&mut self,
channel_item_content_id: u64,
) -> Result<Option<ChannelItemContent>, Error> {
self.conn.exec_first(
"SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `content_id` = ?",
(content_id,),
"SELECT `channel_item_content_id`,
`channel_item_id`
FROM `channel_item_content` WHERE `channel_item_content_id` = ?",
(channel_item_content_id,),
)
}
pub fn contents_total_by_provider_id(
/// Look up one `channel_item_content_description` row by its primary key.
///
/// * `channel_item_content_description_id` - value matched against the
///   `channel_item_content_description_id` column
///
/// Yields the first matching row, or `None` when the query returns no rows.
pub fn channel_item_content_description(
    &mut self,
    channel_item_content_description_id: u64,
) -> Result<Option<ChannelItemContentDescription>, Error> {
    // Column order follows the field order of `ChannelItemContentDescription`.
    let statement = "SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description` FROM `channel_item_content_description`
WHERE `channel_item_content_description_id` = ?";
    let params = (channel_item_content_description_id,);
    self.conn.exec_first(statement, params)
}
pub fn channel_item_content_descriptions_total_by_provider_id(
&mut self,
provider_id: Option<u64>,
keyword: Option<&str>,
) -> Result<usize, Error> {
let total: Option<usize> = self.conn.exec_first(
"SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?",
(provider_id, like(keyword)),
)?;
let total: Option<usize> = match keyword {
Some(k) => self.conn.exec_first(
"SELECT COUNT(*) FROM `channel_item_content_description`
WHERE `provider_id` <=> ? AND `title` LIKE '%?%'",
(provider_id, k),
)?,
None => self.conn.exec_first(
"SELECT COUNT(*) FROM `channel_item_content_description`
WHERE `provider_id` <=> ?",
(provider_id,),
)?,
};
Ok(total.unwrap_or(0))
}
pub fn contents_by_provider_id(
pub fn channel_item_content_descriptions_by_provider_id(
&mut self,
provider_id: Option<u64>,
keyword: Option<&str>,
sort: Sort,
start: Option<usize>,
limit: Option<usize>,
) -> Result<Vec<Content>, Error> {
self.conn.exec(format!(
"SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id, like(keyword), ))
) -> Result<Vec<ChannelItemContentDescription>, Error> {
match keyword {
Some(k) => self.conn.exec(
format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ? AND `title` LIKE '%?%'
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id, k),
),
None => self.conn.exec(
format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ?
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id,),
),
}
}
pub fn content_image(&mut self, content_image_id: u64) -> Result<Option<ContentImage>, Error> {
@ -107,9 +152,4 @@ impl Connection {
}
}
/// Shared search logic
fn like(value: Option<&str>) -> String {
value.map_or("%".into(), |k| format!("{k}%"))
}
const DEFAULT_LIMIT: usize = 100;

View file

@ -13,19 +13,30 @@ pub struct ChannelItem {
pub pub_date: i64,
pub guid: String,
pub link: String,
}
/// Row of the `channel_item_description` table: optional title/description
/// text stored for a channel item.
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemDescription {
/// Primary key of the description row
pub channel_item_description_id: u64,
/// Identifier of the `channel_item` row this text belongs to
pub channel_item_id: u64,
/// `None` when the row was stored without a provider
/// (the crawler inserts its rows this way)
pub provider_id: Option<u64>,
/// Title text, if any was stored
pub title: Option<String>,
/// Description text, if any was stored
pub description: Option<String>,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Content {
pub content_id: u64,
pub struct ChannelItemContent {
pub channel_item_content_id: u64,
pub channel_item_id: u64,
/// None if the original `title` and `description` values
/// parsed from the channel item on crawl
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemContentDescription {
pub channel_item_content_description_id: u64,
pub channel_item_content_id: u64,
pub provider_id: Option<u64>,
pub title: String,
pub description: String,
pub title: Option<String>,
pub description: Option<String>,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
@ -37,6 +48,7 @@ pub struct Provider {
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Image {
pub image_id: u64,
pub provider_id: Option<u64>,
/// Keep image unique by comparing its data hash
pub sha256: String,
/// Original `src` tag value to post-replacing

View file

@ -36,6 +36,23 @@ impl Transaction {
Ok(self.tx.last_insert_id().unwrap())
}
/// Store a title/description pair for a channel.
///
/// * `channel_id` - channel the text belongs to
/// * `provider_id` - provider that produced the text, `None` to store NULL
/// * `title` - optional title text
/// * `description` - optional description text
///
/// Returns the auto-increment id of the inserted `channel_description` row.
pub fn insert_channel_description(
    &mut self,
    channel_id: u64,
    provider_id: Option<u64>,
    title: Option<String>,
    description: Option<String>,
) -> Result<u64, Error> {
    let statement = "INSERT INTO `channel_description` SET `channel_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?";
    let params = (channel_id, provider_id, title, description);
    self.tx.exec_drop(statement, params)?;
    // Read the generated key only after the insert has succeeded.
    let inserted_id = self.tx.last_insert_id();
    Ok(inserted_id.unwrap())
}
pub fn channel_items_total_by_channel_id_guid(
&mut self,
channel_id: u64,
@ -56,66 +73,88 @@ impl Transaction {
pub_date: i64,
guid: &str,
link: &str,
title: Option<String>,
description: Option<String>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item` SET `channel_id` = ?,
`pub_date` = ?,
`guid` = ?,
`link` = ?,
`title` = ?,
`description` = ?",
(channel_id, pub_date, guid, link, title, description),
`link` = ?",
(channel_id, pub_date, guid, link),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn contents_queue_for_provider_id(
&mut self,
provider_id: u64,
) -> Result<Vec<Content>, Error> {
self.tx.exec(
"SELECT `c1`.`content_id`,
`c1`.`channel_item_id`,
`c1`.`provider_id`,
`c1`.`title`,
`c1`.`description`
FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS (
SELECT NULL FROM `content` AS `c2`
WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id`
AND `c2`.`provider_id` = ? LIMIT 1
)",
(provider_id,),
)
}
pub fn insert_content(
pub fn insert_channel_item_description(
&mut self,
channel_item_id: u64,
provider_id: Option<u64>,
title: &str,
description: &str,
title: Option<String>,
description: Option<String>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `content` SET `channel_item_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
"INSERT INTO `channel_item_description` SET `channel_item_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
(channel_item_id, provider_id, title, description),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn replace_content_description(
/// Collect the provider-less content descriptions that still have no
/// counterpart generated by the given provider.
///
/// A row is returned when its `provider_id` is NULL and no other
/// `channel_item_content_description` row exists for the same
/// `channel_item_content_id` with `provider_id` equal to the argument.
///
/// * `provider_id` - provider whose pending work is requested
pub fn channel_item_content_descriptions_queue_for_provider_id(
    &mut self,
    provider_id: u64,
) -> Result<Vec<ChannelItemContentDescription>, Error> {
    // Column names follow the `channel_item_content_description` table; the
    // pre-normalization names (`content_id`, `channel_item_id`) do not exist
    // on it. Column order matches the fields of `ChannelItemContentDescription`.
    self.tx.exec(
        "SELECT `t1`.`channel_item_content_description_id`,
                `t1`.`channel_item_content_id`,
                `t1`.`provider_id`,
                `t1`.`title`,
                `t1`.`description`
         FROM `channel_item_content_description` AS `t1`
         WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS (
             SELECT NULL FROM `channel_item_content_description` AS `t2`
             WHERE `t2`.`channel_item_content_id` = `t1`.`channel_item_content_id`
             AND `t2`.`provider_id` = ? LIMIT 1
         )",
        (provider_id,),
    )
}
/// Register a content record for a channel item.
///
/// * `channel_item_id` - channel item the content record belongs to
///
/// Returns the auto-increment id of the inserted `channel_item_content` row.
pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result<u64, Error> {
    let statement = "INSERT INTO `channel_item_content` SET `channel_item_id` = ?";
    self.tx.exec_drop(statement, (channel_item_id,))?;
    // Read the generated key only after the insert has succeeded.
    let inserted_id = self.tx.last_insert_id();
    Ok(inserted_id.unwrap())
}
/// Store a title/description pair for a channel item content record.
///
/// * `channel_item_content_id` - content record the text belongs to
/// * `provider_id` - provider that produced the text, `None` to store NULL
/// * `title` - optional title text
/// * `description` - optional description text
///
/// Returns the auto-increment id of the inserted
/// `channel_item_content_description` row.
pub fn insert_channel_item_content_description(
    &mut self,
    channel_item_content_id: u64,
    provider_id: Option<u64>,
    title: Option<&str>,
    description: Option<&str>,
) -> Result<u64, Error> {
    let statement = "INSERT INTO `channel_item_content_description` SET `channel_item_content_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?";
    let params = (channel_item_content_id, provider_id, title, description);
    self.tx.exec_drop(statement, params)?;
    // Read the generated key only after the insert has succeeded.
    let inserted_id = self.tx.last_insert_id();
    Ok(inserted_id.unwrap())
}
/// Replace every occurrence of `from` with `to` in the stored `description`
/// text of a channel item content record.
///
/// * `content_id` - the `channel_item_content_id` whose description rows are
///   rewritten (this is the value the crawler passes)
/// * `from` - substring to search for
/// * `to` - replacement substring
///
/// All `channel_item_content_description` rows of that content record are
/// updated, regardless of their `provider_id`.
pub fn replace_channel_item_content_description(
    &mut self,
    content_id: u64,
    from: &str,
    to: &str,
) -> Result<(), Error> {
    // The table has no `content_id` column; rows are addressed by
    // `channel_item_content_id`.
    self.tx.exec_drop(
        "UPDATE `channel_item_content_description`
         SET `description` = REPLACE(`description`, ?, ?)
         WHERE `channel_item_content_id` = ?",
        (from, to, content_id),
    )
}