normalize db tables, optionally persist channel descriptions, remove entries logic from the crawler, update config options

This commit is contained in:
yggverse 2026-01-11 20:36:00 +02:00
parent 7e4d9e3ed6
commit 2b804d8915
10 changed files with 500 additions and 249 deletions

View file

@ -1,5 +1,5 @@
-- MySQL Script generated by MySQL Workbench
-- сб, 10-січ-2026 14:27:50 +0200
-- нд, 11-січ-2026 20:33:40 +0200
-- Model: New Model Version: 1.0
-- MySQL Workbench Forward Engineering
@ -21,7 +21,7 @@ USE `rssto` ;
-- Table `rssto`.`channel`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel` (
`channel_id` INT NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`url` VARCHAR(255) NOT NULL,
PRIMARY KEY (`channel_id`),
UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE)
@ -32,14 +32,12 @@ ENGINE = InnoDB;
-- Table `rssto`.`channel_item`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` (
`channel_item_id` INT NOT NULL AUTO_INCREMENT,
`channel_id` INT NOT NULL,
`channel_item_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL,
`pub_date` BIGINT NOT NULL,
`guid` VARCHAR(255) NOT NULL,
`link` VARCHAR(255) NOT NULL,
`title` VARCHAR(255) NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_item_id`),
PRIMARY KEY (`channel_item_id`, `channel_id`),
INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_channel`
@ -54,7 +52,7 @@ ENGINE = InnoDB;
-- Table `rssto`.`provider`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`provider` (
`provider_id` INT NOT NULL AUTO_INCREMENT,
`provider_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`name` VARCHAR(255) NOT NULL,
PRIMARY KEY (`provider_id`),
UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE)
@ -62,27 +60,17 @@ ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`content`
-- Table `rssto`.`channel_item_content`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`content` (
`content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT NOT NULL,
`provider_id` INT NULL,
`title` VARCHAR(255) NOT NULL,
`description` LONGTEXT NOT NULL,
PRIMARY KEY (`content_id`),
INDEX `fk_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
INDEX `fk_content_provider_idx` (`provider_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_content_channel_item`
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content` (
`channel_item_content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT UNSIGNED NOT NULL,
PRIMARY KEY (`channel_item_content_id`, `channel_item_id`),
INDEX `fk_channel_item_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_channel_item`
FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_content_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
@ -92,31 +80,38 @@ ENGINE = InnoDB;
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`image` (
`image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`provider_id` INT UNSIGNED NULL,
`sha256` CHAR(64) NOT NULL,
`src` VARCHAR(2048) NULL,
`url` VARCHAR(2048) NULL,
`data` MEDIUMBLOB NOT NULL,
PRIMARY KEY (`image_id`),
UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE)
UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE,
INDEX `fk_image_provider_idx` (`provider_id` ASC) VISIBLE,
CONSTRAINT `fk_image_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`content_image`
-- Table `rssto`.`channel_item_content_image`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`content_image` (
`content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`content_id` BIGINT UNSIGNED NOT NULL,
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` (
`channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`content_channel_item_content_id` BIGINT UNSIGNED NOT NULL,
`image_id` BIGINT UNSIGNED NOT NULL,
PRIMARY KEY (`content_image_id`),
INDEX `fk_content_image_content_idx` (`content_id` ASC) VISIBLE,
INDEX `fk_content_image_image_idx` (`image_id` ASC) VISIBLE,
CONSTRAINT `fk_content_image_content`
FOREIGN KEY (`content_id`)
REFERENCES `rssto`.`content` (`content_id`)
PRIMARY KEY (`channel_item_content_image_id`),
INDEX `fk_channel_item_content_image_channel_item_content_idx` (`content_channel_item_content_id` ASC) VISIBLE,
INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_image_channel_item_content`
FOREIGN KEY (`content_channel_item_content_id`)
REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_content_image_image`
CONSTRAINT `fk_channel_item_content_image_image`
FOREIGN KEY (`image_id`)
REFERENCES `rssto`.`image` (`image_id`)
ON DELETE NO ACTION
@ -124,6 +119,84 @@ CREATE TABLE IF NOT EXISTS `rssto`.`content_image` (
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_description`
-- -----------------------------------------------------
-- Title/description text attached to a channel, stored per provider.
-- `provider_id` is nullable; presumably NULL marks the original text taken
-- from the feed itself rather than a provider-generated one (TODO confirm).
-- NOTE(review): a MySQL UNIQUE index permits multiple NULL values, so the
-- (`channel_id`, `provider_id`) index below does not prevent several rows
-- with a NULL `provider_id` for the same channel.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_description` (
`channel_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL,
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_description_id`),
INDEX `fk_channel_description_provider_idx` (`provider_id` ASC) VISIBLE,
INDEX `fk_channel_description_channel_idx` (`channel_id` ASC) VISIBLE,
-- At most one description per channel for each non-NULL provider
UNIQUE INDEX `UNIQUE` (`channel_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_description_channel`
FOREIGN KEY (`channel_id`)
REFERENCES `rssto`.`channel` (`channel_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_description`
-- -----------------------------------------------------
-- Title/description text attached to a channel item, stored per provider.
-- `provider_id` is nullable; presumably NULL marks the original text parsed
-- from the feed item rather than a provider-generated one (TODO confirm).
-- NOTE(review): a MySQL UNIQUE index permits multiple NULL values, so the
-- (`channel_item_id`, `provider_id`) index below does not prevent several
-- rows with a NULL `provider_id` for the same channel item.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_description` (
`channel_item_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT UNSIGNED NOT NULL,
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
INDEX `fk_channel_item_description_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
INDEX `fk_channel_item_description_provider_idx` (`provider_id` ASC) VISIBLE,
PRIMARY KEY (`channel_item_description_id`),
-- At most one description per channel item for each non-NULL provider
UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_description_channel_item`
FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_content_description`
-- -----------------------------------------------------
-- Title/description text attached to a `channel_item_content` row, stored
-- per provider. The ids here are BIGINT UNSIGNED to match the
-- `channel_item_content_id` key they reference.
-- `provider_id` is nullable; presumably NULL marks the originally crawled
-- text rather than a provider-generated one (TODO confirm).
-- NOTE(review): a MySQL UNIQUE index permits multiple NULL values, so the
-- (`channel_item_content_id`, `provider_id`) index below does not prevent
-- several rows with a NULL `provider_id` for the same content.
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_description` (
`channel_item_content_description_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_content_id` BIGINT UNSIGNED NOT NULL,
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_item_content_description_id`),
INDEX `fk_channel_item_content_description_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE,
INDEX `fk_channel_item_content_description_provider_idx` (`provider_id` ASC) VISIBLE,
-- At most one description per content row for each non-NULL provider
UNIQUE INDEX `UNIQUE` (`channel_item_content_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_description_channel_item_content`
FOREIGN KEY (`channel_item_content_id`)
REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_content_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;

View file

@ -19,54 +19,99 @@ impl Connection {
`channel_id`,
`pub_date`,
`guid`,
`link`,
`title`,
`description` FROM `channel_item` WHERE `channel_item_id` = ?",
`link` FROM `channel_item` WHERE `channel_item_id` = ?",
(channel_item_id,),
)
}
pub fn content(&mut self, content_id: u64) -> Result<Option<Content>, Error> {
pub fn channel_item_content(
&mut self,
channel_item_content_id: u64,
) -> Result<Option<ChannelItemContent>, Error> {
self.conn.exec_first(
"SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `content_id` = ?",
(content_id,),
"SELECT `channel_item_content_id`,
`channel_item_id`
FROM `channel_item_content` WHERE `channel_item_content_id` = ?",
(channel_item_content_id,),
)
}
pub fn contents_total_by_provider_id(
/// Fetch a single `channel_item_content_description` row by its primary key.
///
/// Returns `Ok(None)` when no row has the given id.
pub fn channel_item_content_description(
    &mut self,
    channel_item_content_description_id: u64,
) -> Result<Option<ChannelItemContentDescription>, Error> {
    // Column order must match the field order of `ChannelItemContentDescription`
    let query = "SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description` FROM `channel_item_content_description`
WHERE `channel_item_content_description_id` = ?";
    let params = (channel_item_content_description_id,);
    self.conn.exec_first(query, params)
}
pub fn channel_item_content_descriptions_total_by_provider_id(
&mut self,
provider_id: Option<u64>,
keyword: Option<&str>,
) -> Result<usize, Error> {
let total: Option<usize> = self.conn.exec_first(
"SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?",
(provider_id, like(keyword)),
)?;
let total: Option<usize> = match keyword {
Some(k) => self.conn.exec_first(
"SELECT COUNT(*) FROM `channel_item_content_description`
WHERE `provider_id` <=> ? AND `title` LIKE '%?%'",
(provider_id, k),
)?,
None => self.conn.exec_first(
"SELECT COUNT(*) FROM `channel_item_content_description`
WHERE `provider_id` <=> ?",
(provider_id,),
)?,
};
Ok(total.unwrap_or(0))
}
pub fn contents_by_provider_id(
pub fn channel_item_content_descriptions_by_provider_id(
&mut self,
provider_id: Option<u64>,
keyword: Option<&str>,
sort: Sort,
start: Option<usize>,
limit: Option<usize>,
) -> Result<Vec<Content>, Error> {
self.conn.exec(format!(
"SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id, like(keyword), ))
) -> Result<Vec<ChannelItemContentDescription>, Error> {
match keyword {
Some(k) => self.conn.exec(
format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ? AND `title` LIKE '%?%'
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id, k),
),
None => self.conn.exec(
format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ?
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id,),
),
}
}
pub fn content_image(&mut self, content_image_id: u64) -> Result<Option<ContentImage>, Error> {
@ -107,9 +152,4 @@ impl Connection {
}
}
/// Shared search logic: build the `LIKE` pattern for an optional keyword.
///
/// A keyword becomes a prefix pattern (`keyword%`); no keyword yields the
/// match-everything pattern `%`.
fn like(value: Option<&str>) -> String {
    match value {
        Some(keyword) => format!("{keyword}%"),
        None => String::from("%"),
    }
}
const DEFAULT_LIMIT: usize = 100;

View file

@ -13,19 +13,30 @@ pub struct ChannelItem {
pub pub_date: i64,
pub guid: String,
pub link: String,
}
/// Row of the `channel_item_description` table: title/description text
/// attached to a channel item, stored per provider.
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemDescription {
/// Primary key of the description row
pub channel_item_description_id: u64,
/// Channel item this text belongs to
pub channel_item_id: u64,
/// Provider that produced the text; presumably `None` for the original
/// values parsed from the feed item (TODO confirm against the crawler)
pub provider_id: Option<u64>,
/// Nullable in the schema, hence optional
pub title: Option<String>,
/// Nullable in the schema, hence optional
pub description: Option<String>,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Content {
pub content_id: u64,
pub struct ChannelItemContent {
pub channel_item_content_id: u64,
pub channel_item_id: u64,
/// None if the original `title` and `description` values
/// parsed from the channel item on crawl
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemContentDescription {
pub channel_item_content_description_id: u64,
pub channel_item_content_id: u64,
pub provider_id: Option<u64>,
pub title: String,
pub description: String,
pub title: Option<String>,
pub description: Option<String>,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
@ -37,6 +48,7 @@ pub struct Provider {
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Image {
pub image_id: u64,
pub provider_id: Option<u64>,
/// Keep image unique by comparing its data hash
pub sha256: String,
/// Original `src` tag value to post-replacing

View file

@ -36,6 +36,23 @@ impl Transaction {
Ok(self.tx.last_insert_id().unwrap())
}
/// Insert a `channel_description` row inside the current transaction.
///
/// Returns the id reported by `last_insert_id` after the insert.
///
/// # Panics
/// Panics if the connection reports no last insert id.
pub fn insert_channel_description(
    &mut self,
    channel_id: u64,
    provider_id: Option<u64>,
    title: Option<String>,
    description: Option<String>,
) -> Result<u64, Error> {
    let statement = "INSERT INTO `channel_description` SET `channel_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?";
    let params = (channel_id, provider_id, title, description);
    self.tx.exec_drop(statement, params)?;
    let inserted_id = self.tx.last_insert_id().unwrap();
    Ok(inserted_id)
}
pub fn channel_items_total_by_channel_id_guid(
&mut self,
channel_id: u64,
@ -56,66 +73,88 @@ impl Transaction {
pub_date: i64,
guid: &str,
link: &str,
title: Option<String>,
description: Option<String>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item` SET `channel_id` = ?,
`pub_date` = ?,
`guid` = ?,
`link` = ?,
`title` = ?,
`description` = ?",
(channel_id, pub_date, guid, link, title, description),
`link` = ?",
(channel_id, pub_date, guid, link),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
/// Select the `content` rows that still await processing by a provider.
///
/// A row qualifies when its own `provider_id` is NULL and no other `content`
/// row exists for the same `channel_item_id` with the given `provider_id`.
pub fn contents_queue_for_provider_id(
    &mut self,
    provider_id: u64,
) -> Result<Vec<Content>, Error> {
    // Column order must match the field order of `Content`
    let query = "SELECT `c1`.`content_id`,
`c1`.`channel_item_id`,
`c1`.`provider_id`,
`c1`.`title`,
`c1`.`description`
FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS (
SELECT NULL FROM `content` AS `c2`
WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id`
AND `c2`.`provider_id` = ? LIMIT 1
)";
    let params = (provider_id,);
    self.tx.exec(query, params)
}
pub fn insert_content(
pub fn insert_channel_item_description(
&mut self,
channel_item_id: u64,
provider_id: Option<u64>,
title: &str,
description: &str,
title: Option<String>,
description: Option<String>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `content` SET `channel_item_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
"INSERT INTO `channel_item_description` SET `channel_item_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
(channel_item_id, provider_id, title, description),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn replace_content_description(
/// Select the `channel_item_content_description` rows that still await
/// processing by a provider.
///
/// A row qualifies when its own `provider_id` is NULL and no other
/// description row exists for the same `channel_item_content_id` with the
/// given `provider_id`.
pub fn channel_item_content_descriptions_queue_for_provider_id(
    &mut self,
    provider_id: u64,
) -> Result<Vec<ChannelItemContentDescription>, Error> {
    // The table has no `content_id` / `channel_item_id` columns: rows are
    // keyed by `channel_item_content_description_id` and grouped by
    // `channel_item_content_id`. Column order must match the field order of
    // `ChannelItemContentDescription`.
    self.tx.exec(
        "SELECT `t1`.`channel_item_content_description_id`,
                `t1`.`channel_item_content_id`,
                `t1`.`provider_id`,
                `t1`.`title`,
                `t1`.`description`
         FROM `channel_item_content_description` AS `t1`
         WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS (
             SELECT NULL FROM `channel_item_content_description` AS `t2`
             WHERE `t2`.`channel_item_content_id` = `t1`.`channel_item_content_id`
               AND `t2`.`provider_id` = ? LIMIT 1
         )",
        (provider_id,),
    )
}
/// Insert a `channel_item_content` row for the given channel item inside the
/// current transaction.
///
/// Returns the id reported by `last_insert_id` after the insert.
///
/// # Panics
/// Panics if the connection reports no last insert id.
pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result<u64, Error> {
    let statement = "INSERT INTO `channel_item_content` SET `channel_item_id` = ?";
    let params = (channel_item_id,);
    self.tx.exec_drop(statement, params)?;
    let inserted_id = self.tx.last_insert_id().unwrap();
    Ok(inserted_id)
}
/// Insert a `channel_item_content_description` row inside the current
/// transaction.
///
/// Returns the id reported by `last_insert_id` after the insert.
///
/// # Panics
/// Panics if the connection reports no last insert id.
pub fn insert_channel_item_content_description(
    &mut self,
    channel_item_content_id: u64,
    provider_id: Option<u64>,
    title: Option<&str>,
    description: Option<&str>,
) -> Result<u64, Error> {
    let statement = "INSERT INTO `channel_item_content_description` SET `channel_item_content_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?";
    let params = (channel_item_content_id, provider_id, title, description);
    self.tx.exec_drop(statement, params)?;
    let inserted_id = self.tx.last_insert_id().unwrap();
    Ok(inserted_id)
}
/// Replace every occurrence of `from` with `to` in the stored `description`
/// text of a content entry.
///
/// * `content_id` - matched against `channel_item_content_id`, so every
///   description row of that content is updated
/// * `from` / `to` - passed to SQL `REPLACE()` as bound parameters
pub fn replace_channel_item_content_description(
    &mut self,
    content_id: u64,
    from: &str,
    to: &str,
) -> Result<(), Error> {
    // `channel_item_content_description` has no `content_id` column; the
    // content key in this table is `channel_item_content_id`.
    // NOTE(review): assumes callers pass a `channel_item_content` id here
    // (not a description row id) - confirm against the crawler.
    self.tx.exec_drop(
        "UPDATE `channel_item_content_description`
         SET `description` = REPLACE(`description`, ?, ?)
         WHERE `channel_item_content_id` = ?",
        (from, to, content_id),
    )
}