diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml index 50f5a7e..7c2cb2b 100644 --- a/crates/crawler/config.toml +++ b/crates/crawler/config.toml @@ -18,25 +18,28 @@ update = 900 url = "https://1" # Limit latest channel items to crawl (unlimited by default) - items_limit = 20 + items_limit = 5 - # Save Channel item title in the database (currently not in use) - persist_item_title = true + # Save Channel `title` and `description` in the database (currently not in use) + persist_description = true - #Save Channel item description in the database (currently not in use) + # Save Channel item `title` and `description` in the database persist_item_description = true # Allowed tags # * empty to strip all tags (default) - allowed_tags = [] + allowed_tags = ["a", "br", "p", "img"] + + # Grab Channel item content (from the item `link`) + scrape_item_content = false # Scrape title by CSS selector # * None to use Channel item title if exists or fail to continue - # content_title_selector = "h1" + # scrape_item_content_title_selector = "h1" # Scrape description by CSS selector # * None to use Channel item description if exists or fail to continue - # content_description_selector = "article" + # scrape_item_content_description_selector = "article" # Preload content images locally if `Some` # * currently stored in the database @@ -49,25 +52,28 @@ update = 900 url = "https://2" # Limit latest channel items to crawl (unlimited by default) - items_limit = 20 + items_limit = 5 - # Save Channel item title in the database (currently not in use) - persist_item_title = true + # Save Channel `title` and `description` in the database (currently not in use) + persist_description = true - #Save Channel item description in the database (currently not in use) + # Save Channel item `title` and `description` in the database persist_item_description = true # Allowed tags # * empty to strip all tags (default) - allowed_tags = [] + allowed_tags = ["a", "br", "p", "img"] + + # 
Grab Channel item content (from the item `link`) + scrape_item_content = false # Scrape title by CSS selector # * None to use Channel item title if exists or fail to continue - # content_title_selector = "h1" + # scrape_item_content_title_selector = "h1" # Scrape description by CSS selector # * None to use Channel item description if exists or fail to continue - # content_description_selector = "article" + # scrape_item_content_description_selector = "article" # Preload content images locally if `Some` # * currently stored in the database diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index cf2881b..63fe5a5 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -17,16 +17,18 @@ pub struct Channel { pub url: Url, /// Limit latest channel items to crawl (unlimited by default) pub items_limit: Option, - /// Save Channel item title in the database (currently not in use) - pub persist_item_title: bool, - /// Save Channel item description in the database (currently not in use) + /// Save Channel title and description in the database + pub persist_description: bool, + /// Save Channel item title and description in the database pub persist_item_description: bool, + /// Grab Channel item content (from the item `link`) + pub scrape_item_content: bool, /// Scrape title by CSS selector /// * None to use Channel item title if exists or fail to continue - pub content_title_selector: Option, + pub scrape_item_content_title_selector: Option, /// Scrape description by CSS selector /// * None to use Channel item description if exists or fail to continue - pub content_description_selector: Option, + pub scrape_item_content_description_selector: Option, /// Allowed tags /// * empty to strip all tags (default) pub allowed_tags: std::collections::HashSet, diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 110092b..62ecd8d 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -72,14 +72,6 @@ 
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul let channel_url = channel_config.url.to_string(); // allocate once - let channel_items = - match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { - Ok(response) => response.into_items(), - Err(e) => bail!("Could not parse response: `{e}`"), - }; - - let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); - let channel_id = match tx.channel_id_by_url(&channel_url)? { Some(channel_id) => channel_id, None => { @@ -89,6 +81,28 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul } }; + let channel_items = + match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { + Ok(channel) => { + if channel_config.persist_description { + let channel_description_id = tx.insert_channel_description( + channel_id, + None, + Some(strip_tags(channel.title(), None)), + Some(strip_tags( + channel.description(), + Some(&channel_config.allowed_tags), + )), + )?; + debug!("Save channel description #{channel_description_id}") + } + channel.into_items() + } + Err(e) => bail!("Could not parse response: `{e}`"), + }; + + let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); + for channel_item in channel_items.iter().take(channel_items_limit) { let guid = match channel_item.guid { Some(ref guid) => guid.value.as_ref(), @@ -106,72 +120,62 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul None => bail!("Undefined `pub_date`"), }; if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? 
> 0 { + debug!("Channel item `{guid}` already exists, skipped."); continue; // skip next steps as processed } - let channel_item_id = tx.insert_channel_item( - channel_id, - pub_date, - guid, - link, - if channel_config.persist_item_title { - channel_item.title().map(|s| strip_tags(s, None)) - } else { - None - }, - if channel_config.persist_item_description { + let channel_item_id = tx.insert_channel_item(channel_id, pub_date, guid, link)?; + info!("Register new channel item #{channel_item_id} ({link})"); + if channel_config.persist_item_description { + let channel_item_description_id = tx.insert_channel_item_description( + channel_item_id, + None, + channel_item.title().map(|s| strip_tags(s, None)), channel_item .description() - .map(|s| strip_tags(s, Some(&channel_config.allowed_tags))) - } else { - None - }, - )?; - info!("Register new channel item #{channel_item_id} ({link})"); + .map(|s| strip_tags(s, Some(&channel_config.allowed_tags))), + )?; + debug!("Save channel item description #{channel_item_description_id}") + } // preload remote content.. 
+ if !channel_config.scrape_item_content { + continue; + } + let channel_item_content_id = tx.insert_channel_item_content(channel_item_id)?; + info!("Add new content record #{channel_item_content_id}"); + let html = scraper::Html::parse_document(&get(link)?.text()?); - let description = strip_tags( - &match channel_config.content_description_selector { - Some(ref selector) => match html.select(selector).next() { - Some(description) => description.inner_html(), - None => bail!("Could not scrape `description` selector from `{link}`"), - }, - None => match channel_item.description { - Some(ref description) => description.clone(), - None => { - bail!("Could not assign `description` from channel item for `{link}`") - } - }, + let description = match channel_config.scrape_item_content_description_selector { + Some(ref selector) => match html.select(selector).next() { + Some(description) => Some(strip_tags( + &description.inner_html(), + Some(&channel_config.allowed_tags), + )), + None => bail!("Could not scrape `description` selector from `{link}`"), }, - Some(&channel_config.allowed_tags), - ); - let content_id = tx.insert_content( - channel_item_id, + None => None, + }; + let channel_item_content_description_id = tx.insert_channel_item_content_description( + channel_item_content_id, None, - strip_tags( - &match channel_config.content_title_selector { - Some(ref selector) => match html.select(selector).next() { - Some(title) => title.inner_html(), - None => bail!("Could not scrape `title` selector from `{link}`"), - }, - None => match channel_item.title { - Some(ref title) => title.clone(), - None => { - bail!( - "Could not assign `title` from channel item for content in `{link}`" - ) - } - }, + match channel_config.scrape_item_content_title_selector { + Some(ref selector) => match html.select(selector).next() { + Some(title) => Some(strip_tags(&title.inner_html(), None)), + None => bail!("Could not scrape `title` selector from `{link}`"), }, - None, - ) - .trim(), - 
description.trim(), + None => None, + } + .as_ref() + .map(|s| s.trim()), + description.as_ref().map(|s| s.trim()), )?; - info!("Add new content record #{content_id}"); + debug!("Save channel item content description #{channel_item_content_description_id}"); // persist images if enabled if let Some(ref selector) = channel_config.persist_images_selector { use sha2::{Digest, Sha256}; - for element in scraper::Html::parse_document(&description).select(selector) { + if description.is_none() { + bail!("Field `description` is required to scrape images from `{link}`") + } + for element in scraper::Html::parse_document(&description.unwrap()).select(selector) { if let Some(src) = element.value().attr("src") { let absolute = match Url::parse(src) { Ok(url) => url, @@ -197,10 +201,15 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul image_id } }; - let content_image_id = tx.insert_content_image(content_id, image_id)?; + let content_image_id = + tx.insert_content_image(channel_item_content_id, image_id)?; debug!("Add content image relationship #{content_image_id}"); let uri = format!("/image/{image_id}"); - tx.replace_content_description(content_id, src, &uri)?; + tx.replace_channel_item_content_description( + channel_item_content_id, + src, + &uri, + )?; debug!("Replace content image in description from `{src}` to `{uri}`") } } diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 978387b..70f8a50 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -31,7 +31,7 @@ fn index( #[derive(Serialize)] #[serde(crate = "rocket::serde")] struct Row { - content_id: u64, + channel_item_content_description_id: u64, link: String, time: String, title: String, @@ -41,7 +41,7 @@ fn index( Status::InternalServerError })?; let total = conn - .contents_total_by_provider_id(global.provider_id, search) + .channel_item_content_descriptions_total_by_provider_id(global.provider_id, search) .map_err(|e| { error!("Could not get contents 
total: `{e}`"); Status::InternalServerError @@ -73,7 +73,7 @@ fn index( back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), next: if page.unwrap_or(1) * global.list_limit >= total { None } else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, - rows: conn.contents_by_provider_id( + rows: conn.channel_item_content_descriptions_by_provider_id( global.provider_id, search, Sort::Desc, @@ -84,13 +84,16 @@ fn index( Status::InternalServerError })? .into_iter() - .map(|content| { - let channel_item = conn.channel_item(content.channel_item_id).unwrap().unwrap(); + .map(|channel_item_content_description| { + let channel_item = conn.channel_item_content( + channel_item_content_description.channel_item_content_id + ).unwrap().and_then(|c| conn.channel_item(c.channel_item_id).unwrap()).unwrap(); Row { - content_id: content.content_id, + channel_item_content_description_id: + channel_item_content_description.channel_item_content_description_id, link: channel_item.link, time: time(channel_item.pub_date).format(&global.format_time).to_string(), - title: content.title, + title: channel_item_content_description.title.unwrap_or_default(), // @TODO handle } }) .collect::>(), @@ -102,9 +105,9 @@ fn index( )) } -#[get("/")] +#[get("/")] fn info( - content_id: u64, + channel_item_content_description_id: u64, db: &State, meta: &State, global: &State, @@ -113,29 +116,52 @@ fn info( error!("Could not connect database: `{e}`"); Status::InternalServerError })?; - match conn.content(content_id).map_err(|e| { - error!("Could not get content `{content_id}`: `{e}`"); + match conn.channel_item_content_description(channel_item_content_description_id).map_err(|e| { + error!("Could not get `channel_item_content_description_id` {channel_item_content_description_id}: `{e}`"); Status::InternalServerError })? 
{ - Some(content) => { - let channel_item = conn - .channel_item(content.channel_item_id) + Some(channel_item_content_description) => { + let channel_item_content = conn + .channel_item_content(channel_item_content_description.channel_item_content_id) .map_err(|e| { - error!("Could not get requested channel item: `{e}`"); + error!( + "Could not get requested `channel_item_content` #{}: `{e}`", + channel_item_content_description.channel_item_content_id + ); Status::InternalServerError })? .ok_or_else(|| { - error!("Could not find requested channel item"); + error!( + "Could not find requested `channel_item_content` #{}", + channel_item_content_description.channel_item_content_id + ); Status::NotFound })?; + let channel_item = conn + .channel_item(channel_item_content.channel_item_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item` #{}: `{e}`", + channel_item_content.channel_item_id + ); + Status::InternalServerError + })? + .ok_or_else(|| { + error!( + "Could not find requested `channel_item` #{}", + channel_item_content.channel_item_id + ); + Status::NotFound + })?; + let title = channel_item_content_description.title.unwrap_or_default(); // @TODO handle Ok(Template::render( "info", context! 
{ - description: content.description, + description: channel_item_content_description.description, link: channel_item.link, meta: meta.inner(), - title: format!("{}{S}{}", content.title, meta.title), - name: content.title, + title: format!("{title}{S}{}", meta.title), + name: title, time: time(channel_item.pub_date).format(&global.format_time).to_string(), }, )) @@ -175,8 +201,8 @@ fn rss( error!("Could not connect database: `{e}`"); Status::InternalServerError })?; - for content in conn - .contents_by_provider_id( + for channel_item_content_description in conn + .channel_item_content_descriptions_by_provider_id( global.provider_id, search, Sort::Desc, @@ -184,26 +210,53 @@ fn rss( Some(global.list_limit), ) .map_err(|e| { - error!("Could not load channel item contents: `{e}`"); + error!( + "Could not load `channel_item_content_description` for `provider` #{:?}: `{e}`", + global.provider_id + ); Status::InternalServerError })? { - let channel_item = conn - .channel_item(content.channel_item_id) + let channel_item_content = conn + .channel_item_content(channel_item_content_description.channel_item_content_id) .map_err(|e| { - error!("Could not get requested channel item: `{e}`"); + error!( + "Could not get requested `channel_item_content` #{}: `{e}`", + channel_item_content_description.channel_item_content_id + ); Status::InternalServerError })? .ok_or_else(|| { - error!("Could not find requested channel item"); + error!( + "Could not find requested `channel_item_content` #{}", + channel_item_content_description.channel_item_content_id + ); + Status::NotFound + })?; + let channel_item = conn + .channel_item(channel_item_content.channel_item_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item` #{}: `{e}`", + channel_item_content.channel_item_id + ); + Status::InternalServerError + })? 
+ .ok_or_else(|| { + error!( + "Could not find requested `channel_item` #{}", + channel_item_content.channel_item_id + ); Status::NotFound })?; feed.push( - content.channel_item_id, + channel_item_content_description.channel_item_content_description_id, time(channel_item.pub_date), channel_item.link, - content.title, - content.description, + channel_item_content_description.title.unwrap_or_default(), // @TODO handle + channel_item_content_description + .description + .unwrap_or_default(), // @TODO handle ) } Ok(RawXml(feed.commit())) diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera index 65082f9..4cef190 100644 --- a/crates/http/templates/index.html.tera +++ b/crates/http/templates/index.html.tera @@ -3,13 +3,15 @@ {% if rows %} {% for row in rows %}
- -

{{ row.title }}

+ +

{{ row.title }}

{{ row.time }}

{% endfor %} {% else %} -
Nothing.
+
+

Nothing.

+
{% endif %} {% if next %}Next{% endif %} {% if back %}Back{% endif %} diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs index 188902f..7184303 100644 --- a/crates/llm/src/main.rs +++ b/crates/llm/src/main.rs @@ -67,35 +67,50 @@ async fn main() -> Result<()> { loop { debug!("New queue begin..."); let mut tx = db.transaction()?; - for source in tx.contents_queue_for_provider_id(provider_id)? { + for channel_item_content_description in + tx.channel_item_content_descriptions_queue_for_provider_id(provider_id)? + { debug!( - "Begin generating `content_id` #{} using `provider_id` #{provider_id}.", - source.content_id + "Begin generating `channel_item_content_description` #{} using `provider_id` #{provider_id}.", + channel_item_content_description.channel_item_content_description_id ); - - let title = llm - .chat_completion(ChatCompletionRequest::new(&config.llm.model).message( - Message::user(format!("{}\n{}", config.llm.message, source.title)), - )) - .await?; - - let description = llm - .chat_completion(ChatCompletionRequest::new(&config.llm.model).message( - Message::user(format!("{}\n{}", config.llm.message, source.description)), - )) - .await?; - - let content_id = tx.insert_content( - source.channel_item_id, + let title = match channel_item_content_description.title { + Some(subject) => Some( + llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, subject)), + )) + .await? + .choices[0] + .message + .content + .trim() + .to_string(), + ), + None => None, + }; + let description = match channel_item_content_description.description { + Some(subject) => Some( + llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, subject)), + )) + .await? 
+ .choices[0] + .message + .content + .trim() + .to_string(), + ), + None => None, + }; + let channel_item_content_description_id = tx.insert_channel_item_content_description( + channel_item_content_description.channel_item_content_id, Some(provider_id), - &title.choices[0].message.content, - &description.choices[0].message.content, + title.as_deref(), + description.as_deref(), )?; - - debug!( - "Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", - source.content_id - ) + info!( + "Create `channel_item_content_description` #{channel_item_content_description_id} by `provider_id` #{provider_id}." + ); } tx.commit()?; debug!("Queue completed"); diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index 9524e12..2f1e5f6 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- сб, 10-січ-2026 14:27:50 +0200 +-- нд, 11-січ-2026 20:33:40 +0200 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -21,7 +21,7 @@ USE `rssto` ; -- Table `rssto`.`channel` -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`channel` ( - `channel_id` INT NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, `url` VARCHAR(255) NOT NULL, PRIMARY KEY (`channel_id`), UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE) @@ -32,14 +32,12 @@ ENGINE = InnoDB; -- Table `rssto`.`channel_item` -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` ( - `channel_item_id` INT NOT NULL AUTO_INCREMENT, - `channel_id` INT NOT NULL, + `channel_item_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL, `pub_date` BIGINT NOT NULL, `guid` VARCHAR(255) NOT NULL, `link` VARCHAR(255) NOT NULL, - `title` VARCHAR(255) NULL, - `description` LONGTEXT NULL, - PRIMARY KEY (`channel_item_id`), + PRIMARY KEY 
(`channel_item_id`, `channel_id`), INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE, UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE, CONSTRAINT `fk_channel_item_channel` @@ -54,7 +52,7 @@ ENGINE = InnoDB; -- Table `rssto`.`provider` -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`provider` ( - `provider_id` INT NOT NULL AUTO_INCREMENT, + `provider_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, `name` VARCHAR(255) NOT NULL, PRIMARY KEY (`provider_id`), UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE) @@ -62,27 +60,17 @@ ENGINE = InnoDB; -- ----------------------------------------------------- --- Table `rssto`.`content` +-- Table `rssto`.`channel_item_content` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `rssto`.`content` ( - `content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `channel_item_id` INT NOT NULL, - `provider_id` INT NULL, - `title` VARCHAR(255) NOT NULL, - `description` LONGTEXT NOT NULL, - PRIMARY KEY (`content_id`), - INDEX `fk_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE, - INDEX `fk_content_provider_idx` (`provider_id` ASC) VISIBLE, - UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE, - CONSTRAINT `fk_content_channel_item` +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content` ( + `channel_item_content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_id` INT UNSIGNED NOT NULL, + PRIMARY KEY (`channel_item_content_id`, `channel_item_id`), + INDEX `fk_channel_item_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_channel_item` FOREIGN KEY (`channel_item_id`) REFERENCES `rssto`.`channel_item` (`channel_item_id`) ON DELETE NO ACTION - ON UPDATE NO ACTION, - CONSTRAINT `fk_content_provider` - FOREIGN KEY (`provider_id`) - REFERENCES `rssto`.`provider` (`provider_id`) - ON DELETE NO ACTION ON UPDATE NO ACTION) ENGINE = InnoDB; @@ -92,31 +80,38 @@ 
ENGINE = InnoDB; -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`image` ( `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `provider_id` INT UNSIGNED NULL, `sha256` CHAR(64) NOT NULL, `src` VARCHAR(2048) NULL, `url` VARCHAR(2048) NULL, `data` MEDIUMBLOB NOT NULL, PRIMARY KEY (`image_id`), - UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE) + UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE, + INDEX `fk_image_provider_idx` (`provider_id` ASC) VISIBLE, + CONSTRAINT `fk_image_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) ENGINE = InnoDB; -- ----------------------------------------------------- --- Table `rssto`.`content_image` +-- Table `rssto`.`channel_item_content_image` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `rssto`.`content_image` ( - `content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `content_id` BIGINT UNSIGNED NOT NULL, +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` ( + `channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `content_channel_item_content_id` BIGINT UNSIGNED NOT NULL, `image_id` BIGINT UNSIGNED NOT NULL, - PRIMARY KEY (`content_image_id`), - INDEX `fk_content_image_content_idx` (`content_id` ASC) VISIBLE, - INDEX `fk_content_image_image_idx` (`image_id` ASC) VISIBLE, - CONSTRAINT `fk_content_image_content` - FOREIGN KEY (`content_id`) - REFERENCES `rssto`.`content` (`content_id`) + PRIMARY KEY (`channel_item_content_image_id`), + INDEX `fk_channel_item_content_image_channel_item_content_idx` (`content_channel_item_content_id` ASC) VISIBLE, + INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_image_channel_item_content` + FOREIGN KEY (`content_channel_item_content_id`) + REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`) ON DELETE NO 
ACTION ON UPDATE NO ACTION, - CONSTRAINT `fk_content_image_image` + CONSTRAINT `fk_channel_item_content_image_image` FOREIGN KEY (`image_id`) REFERENCES `rssto`.`image` (`image_id`) ON DELETE NO ACTION @@ -124,6 +119,84 @@ CREATE TABLE IF NOT EXISTS `rssto`.`content_image` ( ENGINE = InnoDB; +-- ----------------------------------------------------- +-- Table `rssto`.`channel_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_description` ( + `channel_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + PRIMARY KEY (`channel_description_id`), + INDEX `fk_channel_description_provider_idx` (`provider_id` ASC) VISIBLE, + INDEX `fk_channel_description_channel_idx` (`channel_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_description_channel` + FOREIGN KEY (`channel_id`) + REFERENCES `rssto`.`channel` (`channel_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_description` ( + `channel_item_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_id` INT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + INDEX `fk_channel_item_description_channel_item_idx` (`channel_item_id` ASC) VISIBLE, + INDEX `fk_channel_item_description_provider_idx` (`provider_id` ASC) VISIBLE, + PRIMARY KEY (`channel_item_description_id`), + UNIQUE INDEX `UNIQUE` 
(`channel_item_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_description_channel_item` + FOREIGN KEY (`channel_item_id`) + REFERENCES `rssto`.`channel_item` (`channel_item_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_item_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_content_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_description` ( + `channel_item_content_description_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_content_id` BIGINT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + PRIMARY KEY (`channel_item_content_description_id`), + INDEX `fk_channel_item_content_description_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE, + INDEX `fk_channel_item_content_description_provider_idx` (`provider_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_item_content_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_description_channel_item_content` + FOREIGN KEY (`channel_item_content_id`) + REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_item_content_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + SET SQL_MODE=@OLD_SQL_MODE; SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; diff --git a/crates/mysql/src/connection.rs b/crates/mysql/src/connection.rs index d9617d1..22ce0cf 100644 --- a/crates/mysql/src/connection.rs +++ b/crates/mysql/src/connection.rs @@ -19,54 +19,99 @@ impl 
Connection { `channel_id`, `pub_date`, `guid`, - `link`, - `title`, - `description` FROM `channel_item` WHERE `channel_item_id` = ?", + `link` FROM `channel_item` WHERE `channel_item_id` = ?", (channel_item_id,), ) } - pub fn content(&mut self, content_id: u64) -> Result, Error> { + pub fn channel_item_content( + &mut self, + channel_item_content_id: u64, + ) -> Result, Error> { self.conn.exec_first( - "SELECT `content_id`, - `channel_item_id`, - `provider_id`, - `title`, - `description` FROM `content` WHERE `content_id` = ?", - (content_id,), + "SELECT `channel_item_content_id`, + `channel_item_id` + FROM `channel_item_content` WHERE `channel_item_content_id` = ?", + (channel_item_content_id,), ) } - pub fn contents_total_by_provider_id( + pub fn channel_item_content_description( + &mut self, + channel_item_content_description_id: u64, + ) -> Result, Error> { + self.conn.exec_first( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` FROM `channel_item_content_description` + WHERE `channel_item_content_description_id` = ?", + (channel_item_content_description_id,), + ) + } + + pub fn channel_item_content_descriptions_total_by_provider_id( &mut self, provider_id: Option, keyword: Option<&str>, ) -> Result { - let total: Option = self.conn.exec_first( - "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?", - (provider_id, like(keyword)), - )?; + let total: Option = match keyword { + Some(k) => self.conn.exec_first( + "SELECT COUNT(*) FROM `channel_item_content_description` + WHERE `provider_id` <=> ? 
AND `title` LIKE CONCAT('%', ?, '%')", + (provider_id, k), + )?, + None => self.conn.exec_first( + "SELECT COUNT(*) FROM `channel_item_content_description` + WHERE `provider_id` <=> ?", + (provider_id,), + )?, + }; + Ok(total.unwrap_or(0)) } - pub fn contents_by_provider_id( + pub fn channel_item_content_descriptions_by_provider_id( &mut self, provider_id: Option, keyword: Option<&str>, sort: Sort, start: Option, limit: Option, - ) -> Result, Error> { - self.conn.exec(format!( - "SELECT `content_id`, - `channel_item_id`, - `provider_id`, - `title`, - `description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {},{}", - start.unwrap_or(0), - limit.unwrap_or(DEFAULT_LIMIT) - ), - (provider_id, like(keyword), )) + ) -> Result, Error> { + match keyword { + Some(k) => self.conn.exec( + format!( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` + FROM `channel_item_content_description` + WHERE `provider_id` <=> ? AND `title` LIKE CONCAT('%', ?, '%') + ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}", + start.unwrap_or(0), + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id, k), + ), + None => self.conn.exec( + format!( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` + FROM `channel_item_content_description` + WHERE `provider_id` <=> ? 
+ ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}", + start.unwrap_or(0), + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id,), + ), + } } pub fn content_image(&mut self, content_image_id: u64) -> Result, Error> { @@ -107,9 +152,4 @@ impl Connection { } } -/// Shared search logic -fn like(value: Option<&str>) -> String { - value.map_or("%".into(), |k| format!("{k}%")) -} - const DEFAULT_LIMIT: usize = 100; diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs index 631bc37..867abb8 100644 --- a/crates/mysql/src/table.rs +++ b/crates/mysql/src/table.rs @@ -13,19 +13,30 @@ pub struct ChannelItem { pub pub_date: i64, pub guid: String, pub link: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItemDescription { + pub channel_item_description_id: u64, + pub channel_item_id: u64, + pub provider_id: Option, pub title: Option, pub description: Option, } #[derive(Debug, PartialEq, Eq, FromRow)] -pub struct Content { - pub content_id: u64, +pub struct ChannelItemContent { + pub channel_item_content_id: u64, pub channel_item_id: u64, - /// None if the original `title` and `description` values - /// parsed from the channel item on crawl +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItemContentDescription { + pub channel_item_content_description_id: u64, + pub channel_item_content_id: u64, pub provider_id: Option, - pub title: String, - pub description: String, + pub title: Option, + pub description: Option, } #[derive(Debug, PartialEq, Eq, FromRow)] @@ -37,6 +48,7 @@ pub struct Provider { #[derive(Debug, PartialEq, Eq, FromRow)] pub struct Image { pub image_id: u64, + pub provider_id: Option, /// Keep image unique by comparing its data hash pub sha256: String, /// Original `src` tag value to post-replacing diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs index c2a2077..970aaef 100644 --- a/crates/mysql/src/transaction.rs +++ b/crates/mysql/src/transaction.rs @@ -36,6 
+36,23 @@ impl Transaction { Ok(self.tx.last_insert_id().unwrap()) } + pub fn insert_channel_description( + &mut self, + channel_id: u64, + provider_id: Option, + title: Option, + description: Option, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_description` SET `channel_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + pub fn channel_items_total_by_channel_id_guid( &mut self, channel_id: u64, @@ -56,66 +73,88 @@ impl Transaction { pub_date: i64, guid: &str, link: &str, - title: Option, - description: Option, ) -> Result { self.tx.exec_drop( "INSERT INTO `channel_item` SET `channel_id` = ?, `pub_date` = ?, `guid` = ?, - `link` = ?, - `title` = ?, - `description` = ?", - (channel_id, pub_date, guid, link, title, description), + `link` = ?", + (channel_id, pub_date, guid, link), )?; Ok(self.tx.last_insert_id().unwrap()) } - pub fn contents_queue_for_provider_id( - &mut self, - provider_id: u64, - ) -> Result, Error> { - self.tx.exec( - "SELECT `c1`.`content_id`, - `c1`.`channel_item_id`, - `c1`.`provider_id`, - `c1`.`title`, - `c1`.`description` - FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS ( - SELECT NULL FROM `content` AS `c2` - WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id` - AND `c2`.`provider_id` = ? 
LIMIT 1 - )", - (provider_id,), - ) - } - - pub fn insert_content( + pub fn insert_channel_item_description( &mut self, channel_item_id: u64, provider_id: Option, - title: &str, - description: &str, + title: Option, + description: Option, ) -> Result { self.tx.exec_drop( - "INSERT INTO `content` SET `channel_item_id` = ?, - `provider_id` = ?, - `title` = ?, - `description` = ?", + "INSERT INTO `channel_item_description` SET `channel_item_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", (channel_item_id, provider_id, title, description), )?; Ok(self.tx.last_insert_id().unwrap()) } - pub fn replace_content_description( + pub fn channel_item_content_descriptions_queue_for_provider_id( + &mut self, + provider_id: u64, + ) -> Result, Error> { + self.tx.exec( + "SELECT `t1`.`content_id`, + `t1`.`channel_item_id`, + `t1`.`provider_id`, + `t1`.`title`, + `t1`.`description` + FROM `channel_item_content_description` AS `t1` + WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS ( + SELECT NULL FROM `channel_item_content_description` AS `t2` + WHERE `t2`.`channel_item_id` = `t1`.`channel_item_id` + AND `t2`.`provider_id` = ? 
LIMIT 1 + )", + (provider_id,), + ) + } + + pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_content` SET `channel_item_id` = ?", + (channel_item_id,), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn insert_channel_item_content_description( + &mut self, + channel_item_content_id: u64, + provider_id: Option, + title: Option<&str>, + description: Option<&str>, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_content_description` SET `channel_item_content_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_item_content_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn replace_channel_item_content_description( &mut self, content_id: u64, from: &str, to: &str, ) -> Result<(), Error> { self.tx.exec_drop( - "UPDATE `content` SET `description` = REPLACE(`description`, ?, ?) - WHERE`content_id` = ?", + "UPDATE `channel_item_content_description` + SET `description` = REPLACE(`description`, ?, ?) WHERE`content_id` = ?", (from, to, content_id), ) }