From e070db316ced424d474cc9c675d204aacc001727 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 15:00:55 +0200 Subject: [PATCH 01/65] initial commit --- .gitignore | 1 - Cargo.toml | 30 +---- README.md | 67 +---------- config/example.toml | 19 --- crates/crawler/Cargo.toml | 23 ++++ crates/crawler/LICENSE | 21 ++++ crates/crawler/config/example.toml | 20 ++++ {src => crates/crawler/src}/argument.rs | 0 crates/crawler/src/config.rs | 33 ++++++ crates/crawler/src/main.rs | 124 ++++++++++++++++++++ crates/mysql/Cargo.toml | 13 +++ crates/mysql/LICENSE | 21 ++++ crates/mysql/src/lib.rs | 137 ++++++++++++++++++++++ src/config.rs | 32 ----- src/main.rs | 148 ------------------------ template/gmi/index.gmi | 7 -- template/gmi/index/item.gmi | 6 - template/html/index.html | 49 -------- template/html/index/item.html | 5 - 19 files changed, 400 insertions(+), 356 deletions(-) delete mode 100644 config/example.toml create mode 100644 crates/crawler/Cargo.toml create mode 100644 crates/crawler/LICENSE create mode 100644 crates/crawler/config/example.toml rename {src => crates/crawler/src}/argument.rs (100%) create mode 100644 crates/crawler/src/config.rs create mode 100644 crates/crawler/src/main.rs create mode 100644 crates/mysql/Cargo.toml create mode 100644 crates/mysql/LICENSE create mode 100644 crates/mysql/src/lib.rs delete mode 100644 src/config.rs delete mode 100644 src/main.rs delete mode 100644 template/gmi/index.gmi delete mode 100644 template/gmi/index/item.gmi delete mode 100644 template/html/index.html delete mode 100644 template/html/index/item.html diff --git a/.gitignore b/.gitignore index daab55e..869df07 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ -/public /target Cargo.lock \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ea88211..22e1445 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,24 +1,6 @@ -[package] -name = "rssto" -version = "0.2.2" -edition = "2024" -license = "MIT" -readme = "README.md" -description = "Convert RSS feeds into multiple formats" -keywords = ["rss", "aggregator", "conversion", "html", "gemtext"] -categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] -repository = "https://github.com/YGGverse/rssto" - -[dependencies] -anyhow = "1.0" -chrono = "^0.4.20" -clap = { version = "4.5", features = ["derive"] } -log = "0.4" -regex = "1.12" -reqwest = { version = "0.12", features = ["blocking"] } -rss = "2.0" -serde = { version = "1.0", features = ["derive"] } -strip-tags = "0.1" -toml = "0.9" -tracing-subscriber = { version = "0.3", features = ["env-filter"] } -url = "2.5" +[workspace] +resolver = "2" +members = [ + "crates/crawler", + "crates/mysql", +] \ No newline at end of file diff --git a/README.md b/README.md index 10937a0..f114474 100644 --- a/README.md +++ b/README.md @@ -6,68 +6,5 @@ Convert RSS feeds into multiple formats -## Features - -* [x] Multiple feed sources with flexible TOML config options - * [x] Limit channel items - * [x] Format time - * [x] Multiple export format definition -* [x] Custom templates -* [x] Single export or daemon mode with update time -* [x] Export formats: - * [x] HTML - * [x] [Gemtext](https://geminiprotocol.net/docs/gemtext.gmi) - -## Install - -``` bash -cargo install rssto -``` - -## Launch - -``` bash -rssto -c config/example.toml -``` -> [!TIP] -> * prepend `RUST_LOG=DEBUG` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html)) -> * append `-u TIME` to run as the daemon with `TIME` interval update -> * see `rssto --help` to print all available options - -### Systemd - -1. Install `rssto` by copy the binary compiled into the native system apps destination: - * Linux: `sudo install /home/user/.cargo/bin/rssto /usr/local/bin/rssto` -2. Create `systemd` configuration file at `/etc/systemd/system/rssto.service`: - -``` rssto.service -[Unit] -After=network-online.target -Wants=network-online.target - -[Service] -Type=simple - -User=rssto -Group=rssto - -# Uncomment for debug -# Environment="RUST_LOG=DEBUG" -# Environment="NO_COLOR=1" - -ExecStart=/usr/local/bin/rssto -c /path/to/config.toml - -StandardOutput=file:///home/rssto/debug.log -StandardError=file:///home/rssto/error.log - -[Install] -WantedBy=multi-user.target -``` -* example above requires new system user (`useradd -m rssto`) - -3. Run in priority: - - * `systemctl daemon-reload` - reload systemd configuration - * `systemctl enable rssto` - enable new service - * `systemctl start rssto` - start the process - * `systemctl status rssto` - check process launched +> [!NOTE] +> Branch in development! \ No newline at end of file diff --git a/config/example.toml b/config/example.toml deleted file mode 100644 index 6793e42..0000000 --- a/config/example.toml +++ /dev/null @@ -1,19 +0,0 @@ -update = 60 - -[[feed]] -url = "https://assets.censor.net/rss/censor.net/rss_uk_news.xml" -storage = "./public/censor.net/rss_uk_news" -templates = ["./template/html","./template/gmi"] -list_items_limit = 20 -pub_date_format = "%Y/%m/%d %H:%M:%S %z" -last_build_date_format = "%Y/%m/%d %H:%M:%S %z" -time_generated_format = "%Y/%m/%d %H:%M:%S %z" - -[[feed]] -url = "https://assets.censor.net/rss/censor.net/rss_uk_resonance.xml" -storage = "./public/censor.net/rss_uk_resonance" -templates = ["./template/html","./template/gmi"] -list_items_limit = 20 -pub_date_format = "%Y/%m/%d %H:%M:%S %z" -last_build_date_format = "%Y/%m/%d %H:%M:%S %z" -time_generated_format = "%Y/%m/%d %H:%M:%S %z" \ No newline at end of file diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml new file mode 100644 index 0000000..148545a --- /dev/null +++ b/crates/crawler/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "rssto-crawler" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "Crawl RSS feeds into MySQL database" +keywords = ["rss", "aggregator", "conversion", "mysql", "crawler"] +categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] +repository = "https://github.com/YGGverse/rssto" + +[dependencies] +anyhow = "1.0.100" +chrono = "0.4.42" +clap = { version = "4.5.54", features = ["derive"] } +log = "0.4.29" +mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } +reqwest = { version = "0.13.1", features = ["blocking"] } +rss = "2.0.12" +serde = { version = "1.0.228", features = ["derive"] } +toml = "0.9.10" +tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } +url = { version = "2.5.8", features = ["serde"] } diff --git a/crates/crawler/LICENSE b/crates/crawler/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/crawler/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/crawler/config/example.toml b/crates/crawler/config/example.toml new file mode 100644 index 0000000..a5b62dc --- /dev/null +++ b/crates/crawler/config/example.toml @@ -0,0 +1,20 @@ +update = 60 + +[mysql] +host = "localhost" +port = 3307 +user = "" +password = "" +database = "rssto" + +[[channel]] +url = "https://" +items_limit = 20 +persist_item_title = true +persist_item_description = true + +[[channel]] +url = "https://" +items_limit = 20 +persist_item_title = true +persist_item_description = true \ No newline at end of file diff --git a/src/argument.rs b/crates/crawler/src/argument.rs similarity index 100% rename from src/argument.rs rename to crates/crawler/src/argument.rs diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs new file mode 100644 index 0000000..e9ac4ac --- /dev/null +++ b/crates/crawler/src/config.rs @@ -0,0 +1,33 @@ +use serde::Deserialize; +use url::Url; + +#[derive(Debug, Deserialize)] +pub struct Mysql { + pub database: String, + pub host: String, + pub password: String, + pub port: u16, + pub user: String, +} + +#[derive(Debug, Deserialize)] +pub struct Channel { + /// RSS feed source + pub url: Url, + /// Limit channel items (unlimited by default) + pub items_limit: Option, + /// Save item title + pub persist_item_title: bool, + /// Save item description + pub persist_item_description: bool, +} + +#[derive(Debug, Deserialize)] +pub struct Config { + pub mysql: Mysql, + pub channel: Vec, + /// Update timeout in seconds + /// + /// * None to generate once + pub update: Option, +} diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs new file mode 100644 index 0000000..01aa48e --- /dev/null +++ b/crates/crawler/src/main.rs @@ -0,0 +1,124 @@ +mod argument; +mod config; + +use anyhow::Result; +use log::{debug, info, warn}; +use mysql::Mysql; + +fn main() -> Result<()> { + use argument::Argument; + use chrono::Local; + use clap::Parser; + use std::{env::var, fs::read_to_string}; + + if var("RUST_LOG").is_ok() { + use tracing_subscriber::{EnvFilter, fmt::*}; + struct T; + impl time::FormatTime for T { + fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result { + write!(w, "{}", Local::now()) + } + } + fmt() + .with_timer(T) + .with_env_filter(EnvFilter::from_default_env()) + .init() + } + + let argument = Argument::parse(); + let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?; + + let mut database = Mysql::connect( + &config.mysql.host, + config.mysql.port, + &config.mysql.user, + &config.mysql.password, + &config.mysql.database, + )?; + + info!("Crawler started"); + loop { + debug!("Begin new crawl queue..."); + for feed in &config.channel { + debug!("Update `{}`...", feed.url); + if let Err(e) = crawl(&mut database, feed) { + warn!("Feed `{}` update failed: `{e}`", feed.url) + } + } + debug!("Crawl queue completed"); + if let Some(update) = config.update { + debug!("Wait {update} seconds to continue...",); + std::thread::sleep(std::time::Duration::from_secs(update)) + } else { + return Ok(()); + } + } +} + +fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { + use reqwest::blocking::get; + use rss::Channel; + + let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?; + let channel_items = channel.items(); + let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); + + let feed_url = channel_config.url.to_string(); + let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() { + Some(result) => result.channel_id, + None => db.insert_channel(&feed_url)?, + }; + + for channel_item in channel_items.iter().take(channel_items_limit) { + let guid = match channel_item.guid { + Some(ref guid) => guid.value.clone(), + None => { + warn!("Undefined `guid` field in `{feed_url}`"); + continue; + } + }; + let link = match channel_item.guid { + Some(ref link) => link.value.clone(), + None => { + warn!("Undefined `link` field in `{feed_url}`"); + continue; + } + }; + let pub_date = match channel_item.pub_date { + Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) { + Ok(t) => t.timestamp(), + Err(e) => { + warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`"); + continue; + } + }, + None => { + warn!("Undefined `pub_date` field in `{feed_url}`"); + continue; + } + }; + let channel_item_id = match db + .channel_items_by_channel_id_guid(channel_id, &guid, Some(1))? + .first() + { + Some(result) => result.channel_item_id, + None => db.insert_channel_item( + channel_id, + pub_date, + &guid, + &link, + if channel_config.persist_item_title { + channel_item.title() + } else { + None + }, + if channel_config.persist_item_description { + channel_item.description() + } else { + None + }, + )?, + }; // @TODO + } + Ok(()) +} diff --git a/crates/mysql/Cargo.toml b/crates/mysql/Cargo.toml new file mode 100644 index 0000000..ddce0cc --- /dev/null +++ b/crates/mysql/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "rssto-mysql" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "Shared MySQL database library" +keywords = ["rssto", "database", "mysql", "library", "driver", "api"] +# categories = [] +repository = "https://github.com/YGGverse/rssto" + +[dependencies] +mysql = "26.0.1" diff --git a/crates/mysql/LICENSE b/crates/mysql/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/mysql/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs new file mode 100644 index 0000000..7286a34 --- /dev/null +++ b/crates/mysql/src/lib.rs @@ -0,0 +1,137 @@ +use mysql::{Error, PooledConn, prelude::Queryable}; + +pub struct Mysql { + connection: PooledConn, +} + +impl Mysql { + pub fn connect( + host: &str, + port: u16, + user: &str, + password: &str, + database: &str, + ) -> Result { + Ok(Self { + connection: mysql::Pool::new( + format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(), + )? + .get_conn()?, + }) + } + + pub fn channels_by_url( + &mut self, + url: &str, + limit: Option, + ) -> Result, Error> { + self.connection.exec_map( + format!( + "SELECT `channel_id`, `url` FROM `channel` WHERE `url` = ? LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + ), + (url,), + |(channel_id, url)| Channel { channel_id, url }, + ) + } + + pub fn insert_channel(&mut self, url: &str) -> Result { + self.connection + .exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?; + + Ok(self.connection.last_insert_id()) + } + + pub fn channel_items_by_channel_id_guid( + &mut self, + channel_id: u64, + guid: &str, + limit: Option, + ) -> Result, Error> { + self.connection.exec_map( + format!( + "SELECT `channel_item_id`, `channel_id`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT)), + ( + channel_id, + guid + ), + |( + channel_item_id, + channel_id, + pub_date, + guid, + link, + title, + description + )| + ChannelItem { + channel_item_id, + channel_id, + pub_date, + guid, + link, + title, + description + }, + ) + } + + pub fn insert_channel_item( + &mut self, + channel_id: u64, + pub_date: i64, + guid: &str, + link: &str, + title: Option<&str>, + description: Option<&str>, + ) -> Result { + self.connection.exec_drop( + "INSERT INTO `channel_item` SET `channel_id` = ?, `pub_date` = ?, `guid` = ?, `link` = ?, `title` = ?, `description` = ?", + (channel_id, pub_date, guid, link, title, description), + )?; + Ok(self.connection.last_insert_id()) + } + + pub fn insert_content( + &mut self, + channel_item_id: u64, + source_id: Option, + title: &str, + description: &str, + ) -> Result<(), Error> { + self.connection.exec_drop( + "INSERT INTO `content` SET `channel_item_id` = ?, `source_id` = ?, `title` = ?, `description` = ?", + (channel_item_id, source_id, title, description ), + ) + } +} + +#[derive(Debug, PartialEq, Eq)] +pub struct Channel { + pub channel_id: u64, + pub url: String, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct ChannelItem { + pub channel_item_id: u64, + pub channel_id: u64, + pub pub_date: i32, + pub guid: String, + pub link: String, + pub title: Option, + pub description: Option, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct Content { + pub channel_item_id: u64, + /// None if the original `title` and `description` values + /// parsed from the channel item on crawl + pub source_id: Option, + pub title: String, + pub description: String, +} + +const DEFAULT_LIMIT: usize = 100; diff --git a/src/config.rs b/src/config.rs deleted file mode 100644 index 2fb41ec..0000000 --- a/src/config.rs +++ /dev/null @@ -1,32 +0,0 @@ -use serde::Deserialize; -use std::path::PathBuf; -use url::Url; - -#[derive(Debug, Deserialize)] -pub struct Feed { - /// RSS feed source - pub url: Url, - - /// Destination directory - pub storage: PathBuf, - - /// Path to templates (export formats) - pub templates: Vec, - - /// Limit channel items (unlimited by default) - pub list_items_limit: Option, - - pub pub_date_format: String, - pub last_build_date_format: String, - pub time_generated_format: String, -} - -#[derive(Debug, Deserialize)] -pub struct Config { - pub feed: Vec, - - /// Update timeout in seconds - /// - /// * None to generate once - pub update: Option, -} diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 8aae376..0000000 --- a/src/main.rs +++ /dev/null @@ -1,148 +0,0 @@ -mod argument; -mod config; - -use anyhow::Result; -use argument::Argument; -use chrono::{DateTime, Local}; -use clap::Parser; -use config::{Config, Feed}; -use log::{debug, info, warn}; -use std::{ - env::var, - fs::{File, create_dir_all, read_to_string}, - io::Write, - path::PathBuf, -}; -use strip_tags::*; - -fn main() -> Result<()> { - if var("RUST_LOG").is_ok() { - use tracing_subscriber::{EnvFilter, fmt::*}; - struct T; - impl time::FormatTime for T { - fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result { - write!(w, "{}", Local::now()) - } - } - fmt() - .with_timer(T) - .with_env_filter(EnvFilter::from_default_env()) - .init() - } - - let argument = Argument::parse(); - let config: Config = toml::from_str(&read_to_string(argument.config)?)?; - - info!("Crawler started"); - - loop { - debug!("Begin new crawl queue..."); - - for feed in &config.feed { - debug!("Update `{}`...", feed.url); - if let Err(e) = crawl(feed) { - warn!("Feed `{}` update failed: `{e}`", feed.url) - } - } - - debug!("Crawl queue completed"); - - if let Some(update) = config.update { - debug!("Wait {update} seconds to continue...",); - std::thread::sleep(std::time::Duration::from_secs(update)) - } else { - return Ok(()); - } - } -} - -fn crawl(feed: &Feed) -> Result<()> { - use reqwest::blocking::get; - use rss::Channel; - - let channel = Channel::read_from(&get(feed.url.as_str())?.bytes()?[..])?; - let channel_items = channel.items(); - let channel_items_limit = feed.list_items_limit.unwrap_or(channel_items.len()); - let regex = regex::Regex::new(r"\n{3,}").unwrap(); - - for template in &feed.templates { - let root = PathBuf::from(template); - let extension = root.file_name().unwrap().to_string_lossy(); - - let index = { - let mut p = PathBuf::from(&root); - p.push(format!("index.{extension}")); - read_to_string(p)? - }; - - let index_item = { - let mut p = PathBuf::from(&root); - p.push("index"); - p.push(format!("item.{extension}")); - read_to_string(p)? - }; - - create_dir_all(&feed.storage)?; - File::create({ - let mut p = PathBuf::from(&feed.storage); - p.push(format!("index.{extension}")); - p - })? - .write_all( - index - .replace("{title}", &strip_tags(channel.title())) - .replace("{description}", &strip_tags(channel.description())) - .replace("{link}", channel.link()) - .replace("{language}", channel.language().unwrap_or_default()) - .replace( - "{pub_date}", - &time(channel.pub_date(), &feed.pub_date_format), - ) - .replace( - "{last_build_date}", - &time(channel.last_build_date(), &feed.last_build_date_format), - ) - .replace("{time_generated}", &time(None, &feed.time_generated_format)) - .replace( - "{items}", - &channel_items - .iter() - .take(channel_items_limit) - .map(|i| { - regex - .replace_all( - &index_item - .replace( - "{title}", - &strip_tags(i.title().unwrap_or_default()), - ) - .replace( - "{description}", - &strip_tags(i.description().unwrap_or_default()), - ) - .replace("{link}", i.link().unwrap_or_default()) - .replace( - "{pub_date}", - &time(i.pub_date(), &feed.pub_date_format), - ), - "\n\n", - ) - .to_string() - }) - .collect::(), - ) - .as_bytes(), - )? - } - - Ok(()) -} - -fn time(value: Option<&str>, format: &str) -> String { - match value { - Some(v) => DateTime::parse_from_rfc2822(v).unwrap(), - None => Local::now().into(), - } - .format(format) - .to_string() -} diff --git a/template/gmi/index.gmi b/template/gmi/index.gmi deleted file mode 100644 index c6cbf99..0000000 --- a/template/gmi/index.gmi +++ /dev/null @@ -1,7 +0,0 @@ -# {title} - -{description} - -## {time_generated} - -{items} \ No newline at end of file diff --git a/template/gmi/index/item.gmi b/template/gmi/index/item.gmi deleted file mode 100644 index dea579f..0000000 --- a/template/gmi/index/item.gmi +++ /dev/null @@ -1,6 +0,0 @@ - -### {title} - -{description} - -=> {link} {pub_date} diff --git a/template/html/index.html b/template/html/index.html deleted file mode 100644 index 1d16c3e..0000000 --- a/template/html/index.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - {title} - - - -
-

{title}

- {description} -
-
- {items} -
-
-

- Source: {title} | - Updated: {pub_date} | - Build: {last_build_date} | - Generated: {time_generated} -

-

- Powered by rssto. -

-
- - \ No newline at end of file diff --git a/template/html/index/item.html b/template/html/index/item.html deleted file mode 100644 index b60abde..0000000 --- a/template/html/index/item.html +++ /dev/null @@ -1,5 +0,0 @@ - \ No newline at end of file From 31065eb4137098ef2cc8dcf99aa8c7161cc89231 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 15:50:44 +0200 Subject: [PATCH 02/65] add initial db structure --- crates/mysql/database/0.1.0.sql | 91 +++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 crates/mysql/database/0.1.0.sql diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql new file mode 100644 index 0000000..555f283 --- /dev/null +++ b/crates/mysql/database/0.1.0.sql @@ -0,0 +1,91 @@ +-- MySQL Script generated by MySQL Workbench +-- Wed 07 Jan 2026 03:49:23 PM EET +-- Model: New Model Version: 1.0 +-- MySQL Workbench Forward Engineering + +SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0; +SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0; +SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'; + +-- ----------------------------------------------------- +-- Schema rssto +-- ----------------------------------------------------- + +-- ----------------------------------------------------- +-- Schema rssto +-- ----------------------------------------------------- +CREATE SCHEMA IF NOT EXISTS `rssto` ; +USE `rssto` ; + +-- ----------------------------------------------------- +-- Table `rssto`.`channel` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel` ( + `channel_id` INT NOT NULL AUTO_INCREMENT, + `url` VARCHAR(255) NOT NULL, + PRIMARY KEY (`channel_id`), + UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` ( + `channel_item_id` INT NOT NULL AUTO_INCREMENT, + `channel_id` INT NOT NULL, + `pub_date` BIGINT NOT NULL, + `link` VARCHAR(255) NOT NULL, + `title` VARCHAR(255) NULL, + `description` LONGTEXT NULL, + PRIMARY KEY (`channel_item_id`), + UNIQUE INDEX `url_UNIQUE` (`link` ASC) VISIBLE, + INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_channel` + FOREIGN KEY (`channel_id`) + REFERENCES `rssto`.`channel` (`channel_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`source` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`source` ( + `source_id` INT NOT NULL AUTO_INCREMENT, + `name` VARCHAR(255) NOT NULL, + PRIMARY KEY (`source_id`), + UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`content` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`content` ( + `content_id` BIGINT NOT NULL AUTO_INCREMENT, + `channel_item_id` INT NOT NULL, + `source_id` INT NULL, + `title` VARCHAR(255) NOT NULL, + `description` LONGTEXT NOT NULL, + PRIMARY KEY (`content_id`), + INDEX `fk_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE, + INDEX `fk_content_source_idx` (`source_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `source_id` ASC) VISIBLE, + CONSTRAINT `fk_content_channel_item` + FOREIGN KEY (`channel_item_id`) + REFERENCES `rssto`.`channel_item` (`channel_item_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_content_source` + FOREIGN KEY (`source_id`) + REFERENCES `rssto`.`source` (`source_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +SET SQL_MODE=@OLD_SQL_MODE; +SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; +SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; From d669f1ba78d17074e266cc5a714ae7fa363f2059 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 15:56:25 +0200 Subject: [PATCH 03/65] add comments --- crates/crawler/config/example.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/crawler/config/example.toml b/crates/crawler/config/example.toml index a5b62dc..a6366d4 100644 --- a/crates/crawler/config/example.toml +++ b/crates/crawler/config/example.toml @@ -1,5 +1,8 @@ -update = 60 +# Rescan feed channels time, in seconds +update = 900 +# Database connection setup +# * see crates/mysql/database [mysql] host = "localhost" port = 3307 @@ -7,6 +10,7 @@ user = "" password = "" database = "rssto" +# Content sources (unlimited) [[channel]] url = "https://" items_limit = 20 From 63373609da69b59ac0eb6e063ae46c4d27fc644b Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 16:13:57 +0200 Subject: [PATCH 04/65] fix default port --- crates/crawler/config/example.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/crawler/config/example.toml b/crates/crawler/config/example.toml index a6366d4..fac36cf 100644 --- a/crates/crawler/config/example.toml +++ b/crates/crawler/config/example.toml @@ -5,7 +5,7 @@ update = 900 # * see crates/mysql/database [mysql] host = "localhost" -port = 3307 +port = 3306 user = "" password = "" database = "rssto" From 6bf89cbc3e2e07a0fb738dfa313152674bc713ff Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 16:19:11 +0200 Subject: [PATCH 05/65] add missed `guid` field, update unique index members and name --- crates/mysql/database/0.1.0.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index 555f283..d35b99d 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- Wed 07 Jan 2026 03:49:23 PM EET +-- Wed 07 Jan 2026 04:18:03 PM EET -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -35,12 +35,13 @@ CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` ( `channel_item_id` INT NOT NULL AUTO_INCREMENT, `channel_id` INT NOT NULL, `pub_date` BIGINT NOT NULL, + `guid` VARCHAR(255) NOT NULL, `link` VARCHAR(255) NOT NULL, `title` VARCHAR(255) NULL, `description` LONGTEXT NULL, PRIMARY KEY (`channel_item_id`), - UNIQUE INDEX `url_UNIQUE` (`link` ASC) VISIBLE, INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE, CONSTRAINT `fk_channel_item_channel` FOREIGN KEY (`channel_id`) REFERENCES `rssto`.`channel` (`channel_id`) From d8f2d723f52b94544834d22be8c0c566dcbc69e8 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 18:25:21 +0200 Subject: [PATCH 06/65] handle some 3'th party errors, init scraper features --- crates/crawler/Cargo.toml | 3 +- crates/crawler/config/example.toml | 8 ++- crates/crawler/src/config.rs | 10 +++- crates/crawler/src/main.rs | 83 ++++++++++++++++++++++++------ 4 files changed, 84 insertions(+), 20 deletions(-) diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml index 148545a..11d6062 100644 --- a/crates/crawler/Cargo.toml +++ b/crates/crawler/Cargo.toml @@ -17,7 +17,8 @@ log = "0.4.29" mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } reqwest = { version = "0.13.1", features = ["blocking"] } rss = "2.0.12" +scraper = { version = "0.25.0", features = ["serde"] } serde = { version = "1.0.228", features = ["derive"] } toml = "0.9.10" tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } -url = { version = "2.5.8", features = ["serde"] } +url = { version = "2.5.8", features = ["serde"] } \ No newline at end of file diff --git a/crates/crawler/config/example.toml b/crates/crawler/config/example.toml index fac36cf..bde12ba 100644 --- a/crates/crawler/config/example.toml +++ b/crates/crawler/config/example.toml @@ -16,9 +16,15 @@ url = "https://" items_limit = 20 persist_item_title = true persist_item_description = true +# optional: +# content_title_selector = "h1" +# content_description_selector = "article" [[channel]] url = "https://" items_limit = 20 persist_item_title = true -persist_item_description = true \ No newline at end of file +persist_item_description = true +# optional: +# content_title_selector = "h1" +# content_description_selector = "article" \ No newline at end of file diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index e9ac4ac..dc325b5 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -1,3 +1,4 @@ +use scraper::Selector; use serde::Deserialize; use url::Url; @@ -20,14 +21,19 @@ pub struct Channel { pub persist_item_title: bool, /// Save item description pub persist_item_description: bool, + /// Scrape title by CSS selector + /// * None to ignore + pub content_title_selector: Option, + /// Scrape description by CSS selector + /// * None to ignore + pub content_description_selector: Option, } #[derive(Debug, Deserialize)] pub struct Config { pub mysql: Mysql, pub channel: Vec, - /// Update timeout in seconds - /// + /// Channels update timeout in seconds /// * None to generate once pub update: Option, } diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 01aa48e..7cdfc0b 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -4,6 +4,7 @@ mod config; use anyhow::Result; use log::{debug, info, warn}; use mysql::Mysql; +use reqwest::blocking::get; fn main() -> Result<()> { use argument::Argument; @@ -27,7 +28,6 @@ fn main() -> Result<()> { let argument = Argument::parse(); let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?; - let mut database = Mysql::connect( &config.mysql.host, config.mysql.port, @@ -39,10 +39,10 @@ fn main() -> Result<()> { info!("Crawler started"); loop { debug!("Begin new crawl queue..."); - for feed in &config.channel { - debug!("Update `{}`...", feed.url); - if let Err(e) = crawl(&mut database, feed) { - warn!("Feed `{}` update failed: `{e}`", feed.url) + for c in &config.channel { + debug!("Update `{}`...", c.url); + if let Err(e) = crawl(&mut database, c) { + warn!("Channel `{}` update failed: `{e}`", c.url) } } debug!("Crawl queue completed"); @@ -56,31 +56,50 @@ fn main() -> Result<()> { } fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { - use reqwest::blocking::get; use rss::Channel; + use scraper::Selector; + + // shared local helpers + fn scrape(url: &str, selector: &Selector) -> Result> { + let document = scraper::Html::parse_document(&get(url)?.text()?); + Ok(if let Some(first) = document.select(selector).next() { + Some(first.inner_html()) + } else { + warn!("Could not scrape requested inner"); + None + }) + } + + // allocate once + let channel_url = channel_config.url.to_string(); + + let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { + Ok(response) => response.into_items(), + Err(e) => { + warn!("Could not parse response from `{channel_url}`: `{e}`"); + return Ok(()); + } + }; - let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?; - let channel_items = channel.items(); let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); - let feed_url = channel_config.url.to_string(); - let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() { + let channel_id = match db.channels_by_url(&channel_url, Some(1))?.first() { Some(result) => result.channel_id, - None => db.insert_channel(&feed_url)?, + None => db.insert_channel(&channel_url)?, }; for channel_item in channel_items.iter().take(channel_items_limit) { let guid = match channel_item.guid { Some(ref guid) => guid.value.clone(), None => { - warn!("Undefined `guid` field in `{feed_url}`"); + warn!("Undefined `guid` field in `{channel_url}`"); continue; } }; let link = match channel_item.guid { Some(ref link) => link.value.clone(), None => { - warn!("Undefined `link` field in `{feed_url}`"); + warn!("Undefined `link` field in `{channel_url}`"); continue; } }; @@ -88,12 +107,12 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) { Ok(t) => t.timestamp(), Err(e) => { - warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`"); + warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`"); continue; } }, None => { - warn!("Undefined `pub_date` field in `{feed_url}`"); + warn!("Undefined `pub_date` field in `{channel_url}`"); continue; } }; @@ -118,7 +137,39 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { None }, )?, - }; // @TODO + }; + + // @TODO preload remote content + + let title = match channel_config.content_title_selector { + Some(ref selector) => match scrape(&link, selector) { + Ok(value) => value, + Err(e) => { + warn!("Could not update `title` selector in `{channel_url}`: `{e}`"); + continue; + } + }, + None => None, + }; + + let description = match channel_config.content_description_selector { + Some(ref selector) => match scrape(&link, selector) { + Ok(value) => value, + Err(e) => { + warn!("Could not update `description` selector in `{channel_url}`: `{e}`"); + continue; + } + }, + None => None, + }; + + if title.is_none() && description.is_none() { + continue; + } + + // @TODO insert content record + + println!("{:?}", description) } Ok(()) } From 480cd21e73db4bcafd42983f476941cb8496aeca Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 18:25:44 +0200 Subject: [PATCH 07/65] fix field asset and type --- crates/mysql/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 7286a34..92ffb27 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -50,7 +50,7 @@ impl Mysql { ) -> Result, Error> { self.connection.exec_map( format!( - "SELECT `channel_item_id`, `channel_id`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}", + "SELECT `channel_item_id`, `channel_id`, `pub_date`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT)), ( channel_id, @@ -117,7 +117,7 @@ pub struct Channel { pub struct ChannelItem { pub channel_item_id: u64, pub channel_id: u64, - pub pub_date: i32, + pub pub_date: i64, pub guid: String, pub link: String, pub title: Option, From c0734731cb1b4500091d0cf1ba12f45ee68f2886 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 19:02:00 +0200 Subject: [PATCH 08/65] disallow nullable title/description values for the content table, implement `contents_by_channel_item_id_source_id`, return last insert id for `insert_content`, fix `content_id` data type, implement initial content version save on crawl --- crates/crawler/src/main.rs | 52 ++++++++++++++++++++++++++++---------- crates/mysql/src/lib.rs | 26 ++++++++++++++++--- 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 7cdfc0b..fe86502 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -139,37 +139,61 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { )?, }; - // @TODO preload remote content + // preload remote content let title = match channel_config.content_title_selector { Some(ref selector) => match scrape(&link, selector) { - Ok(value) => value, + Ok(value) => match value { + Some(title) => title, + None => { + warn!("Could not scrape `title` selector in `{channel_url}`"); + continue; + } + }, Err(e) => { warn!("Could not update `title` selector in `{channel_url}`: `{e}`"); continue; } }, - None => None, + None => match channel_item.title { + Some(ref title) => title.clone(), + None => { + warn!( + "Could not assign `title` from channel item for content in `{channel_url}`" + ); + continue; + } + }, }; - let description = match channel_config.content_description_selector { Some(ref selector) => match scrape(&link, selector) { - Ok(value) => value, + Ok(value) => match value { + Some(description) => description, + None => { + warn!("Could not scrape `description` selector in `{channel_url}`"); + continue; + } + }, Err(e) => { warn!("Could not update `description` selector in `{channel_url}`: `{e}`"); continue; } }, - None => None, + None => match channel_item.description { + Some(ref description) => description.clone(), + None => { + warn!( + "Could not assign `description` from channel item for content in `{channel_url}`" + ); + continue; + } + }, }; - - if title.is_none() && description.is_none() { - continue; - } - - // @TODO insert content record - - println!("{:?}", description) + assert!( + db.contents_by_channel_item_id_source_id(channel_item_id, None, Some(1))? + .is_empty() + ); + let _content_id = db.insert_content(channel_item_id, None, title, description)?; } Ok(()) } diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 92ffb27..4219d50 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -93,17 +93,34 @@ impl Mysql { Ok(self.connection.last_insert_id()) } + pub fn contents_by_channel_item_id_source_id( + &mut self, + channel_item_id: u64, + source_id: Option, + limit: Option, + ) -> Result, Error> { + self.connection.exec_map( + format!( + "SELECT `content_id`, `channel_item_id`, `source_id`, `title`, `description` FROM `content` WHERE `channel_item_id` = ? AND `source_id` = ? LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + ), + (channel_item_id, source_id), + |(content_id, channel_item_id,source_id, title, description)| Content { content_id, channel_item_id, source_id, title, description }, + ) + } + pub fn insert_content( &mut self, channel_item_id: u64, source_id: Option, - title: &str, - description: &str, - ) -> Result<(), Error> { + title: String, + description: String, + ) -> Result { self.connection.exec_drop( "INSERT INTO `content` SET `channel_item_id` = ?, `source_id` = ?, `title` = ?, `description` = ?", (channel_item_id, source_id, title, description ), - ) + )?; + Ok(self.connection.last_insert_id()) } } @@ -126,6 +143,7 @@ pub struct ChannelItem { #[derive(Debug, PartialEq, Eq)] pub struct Content { + pub content_id: u64, pub channel_item_id: u64, /// None if the original `title` and `description` values /// parsed from the channel item on crawl From 259ac118dcdf6c9ed4d4722890a88cce6d5d17d8 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 19:17:47 +0200 Subject: [PATCH 09/65] fix channel item links with its data type --- crates/crawler/src/main.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index fe86502..6aee166 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -90,14 +90,14 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { for channel_item in channel_items.iter().take(channel_items_limit) { let guid = match channel_item.guid { - Some(ref guid) => guid.value.clone(), + Some(ref guid) => guid.value.as_ref(), None => { warn!("Undefined `guid` field in `{channel_url}`"); continue; } }; - let link = match channel_item.guid { - Some(ref link) => link.value.clone(), + let link = match channel_item.link { + Some(ref link) => link, None => { warn!("Undefined `link` field in `{channel_url}`"); continue; @@ -117,15 +117,15 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { } }; let channel_item_id = match db - .channel_items_by_channel_id_guid(channel_id, &guid, Some(1))? + .channel_items_by_channel_id_guid(channel_id, guid, Some(1))? .first() { Some(result) => result.channel_item_id, None => db.insert_channel_item( channel_id, pub_date, - &guid, - &link, + guid, + link, if channel_config.persist_item_title { channel_item.title() } else { @@ -142,7 +142,7 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { // preload remote content let title = match channel_config.content_title_selector { - Some(ref selector) => match scrape(&link, selector) { + Some(ref selector) => match scrape(link, selector) { Ok(value) => match value { Some(title) => title, None => { @@ -166,7 +166,7 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { }, }; let description = match channel_config.content_description_selector { - Some(ref selector) => match scrape(&link, selector) { + Some(ref selector) => match scrape(link, selector) { Ok(value) => match value { Some(description) => description, None => { From 776de04c1db3be5c5477f765cdc9cf59b5ae0dd9 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 19:27:36 +0200 Subject: [PATCH 10/65] skip processed channel items --- crates/crawler/src/main.rs | 49 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 6aee166..ab8ccbe 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -59,7 +59,7 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { use rss::Channel; use scraper::Selector; - // shared local helpers + /// local helper fn scrape(url: &str, selector: &Selector) -> Result> { let document = scraper::Html::parse_document(&get(url)?.text()?); Ok(if let Some(first) = document.select(selector).next() { @@ -70,8 +70,7 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { }) } - // allocate once - let channel_url = channel_config.url.to_string(); + let channel_url = channel_config.url.to_string(); // allocate once let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { Ok(response) => response.into_items(), @@ -116,31 +115,29 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { continue; } }; - let channel_item_id = match db + if !db .channel_items_by_channel_id_guid(channel_id, guid, Some(1))? - .first() + .is_empty() { - Some(result) => result.channel_item_id, - None => db.insert_channel_item( - channel_id, - pub_date, - guid, - link, - if channel_config.persist_item_title { - channel_item.title() - } else { - None - }, - if channel_config.persist_item_description { - channel_item.description() - } else { - None - }, - )?, - }; - - // preload remote content - + continue; // skip next steps as processed + } + let channel_item_id = db.insert_channel_item( + channel_id, + pub_date, + guid, + link, + if channel_config.persist_item_title { + channel_item.title() + } else { + None + }, + if channel_config.persist_item_description { + channel_item.description() + } else { + None + }, + )?; + // preload remote content.. let title = match channel_config.content_title_selector { Some(ref selector) => match scrape(link, selector) { Ok(value) => match value { From feb3a2d519568e91d642ce3ce7e4b04b0770edf5 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 19:30:12 +0200 Subject: [PATCH 11/65] add comment --- crates/crawler/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index ab8ccbe..2c5c414 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -191,6 +191,7 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { .is_empty() ); let _content_id = db.insert_content(channel_item_id, None, title, description)?; + // @TODO preload media } Ok(()) } From 9269ec2a9e354b9b7dc40121b065e2e392f96788 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 20:53:37 +0200 Subject: [PATCH 12/65] add readme --- README.md | 7 ++++- crates/crawler/README.md | 58 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 crates/crawler/README.md diff --git a/README.md b/README.md index f114474..9b14499 100644 --- a/README.md +++ b/README.md @@ -7,4 +7,9 @@ Convert RSS feeds into multiple formats > [!NOTE] -> Branch in development! \ No newline at end of file +> Branch in development! + +## Components + +* `rssto-mysql` - shared database library +* `rssto-crawler` - RSS feed reader and data scrapper daemon \ No newline at end of file diff --git a/crates/crawler/README.md b/crates/crawler/README.md new file mode 100644 index 0000000..9ea940f --- /dev/null +++ b/crates/crawler/README.md @@ -0,0 +1,58 @@ +# rssto-crawler + +## Install + +``` bash +git clone https://github.com/YGGverse/rssto.git +cd rssto +cargo build --release +``` + +## Launch + +``` bash +rssto-crawler -c config/example.toml +``` +> [!TIP] +> * prepend `RUST_LOG=rssto_crawler=trace` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html)) +> * or just `RUST_LOG=trace` to debug all components in use +> * append `-u TIME` to run as the daemon with `TIME` interval update +> * see `rssto-crawler --help` to print all available options + +### Systemd + +1. Install `rssto-crawler` by copy the binary compiled into the native system apps destination: + * Linux: `sudo install target/release/rssto-crawler /usr/local/bin/rssto-crawler` +2. Create `systemd` configuration file at `/etc/systemd/system/rssto-crawler.service`: + +``` rssto-crawler.service +[Unit] +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple + +User=rssto +Group=rssto + +# Uncomment for debug +# Environment="RUST_LOG=DEBUG" +# Environment="NO_COLOR=1" + +ExecStart=/usr/local/bin/rssto-crawler -c /path/to/config.toml + +StandardOutput=file:///home/rssto/crawler-debug.log +StandardError=file:///home/rssto/crawler-error.log + +[Install] +WantedBy=multi-user.target +``` +* example above requires new system user (`useradd -m rssto`) + +3. Run in priority: + +* `systemctl daemon-reload` - reload systemd configuration +* `systemctl enable rssto-crawler` - enable new service +* `systemctl start rssto-crawler` - start the process +* `systemctl status rssto-crawler` - check process launched \ No newline at end of file From 4af10ee153943db72330b8ece821d1040840993b Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 21:28:01 +0200 Subject: [PATCH 13/65] implement `contents` method --- crates/mysql/src/lib.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 4219d50..b0fff0e 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -93,6 +93,16 @@ impl Mysql { Ok(self.connection.last_insert_id()) } + pub fn contents(&mut self, limit: Option) -> Result, Error> { + self.connection.query_map( + format!( + "SELECT `content_id`, `channel_item_id`, `source_id`, `title`, `description` FROM `content` LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + ), + |(content_id, channel_item_id,source_id, title, description)| Content { content_id, channel_item_id, source_id, title, description }, + ) + } + pub fn contents_by_channel_item_id_source_id( &mut self, channel_item_id: u64, From 6d3aac409ac85aad62b237c6d81dce024e850191 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 21:56:16 +0200 Subject: [PATCH 14/65] implement `channel_item` getter --- crates/mysql/src/lib.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index b0fff0e..1b58530 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -42,6 +42,25 @@ impl Mysql { Ok(self.connection.last_insert_id()) } + pub fn channel_item(&mut self, channel_item_id: u64) -> Result, Error> { + self.connection.exec_first( + "SELECT `channel_item_id`, `channel_id`, `pub_date`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_item_id` = ?", + (channel_item_id,) + ).map(|row| { + row.map(|(channel_item_id, channel_id, pub_date, guid, link, title, description)| { + ChannelItem { + channel_item_id, + channel_id, + pub_date, + guid, + link, + title, + description, + } + }) + }) + } + pub fn channel_items_by_channel_id_guid( &mut self, channel_id: u64, From 98ec67175828170e23a47e570586105d5cc02e0d Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 21:59:11 +0200 Subject: [PATCH 15/65] format long rows --- crates/mysql/src/lib.rs | 94 ++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 1b58530..2267156 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -43,22 +43,32 @@ impl Mysql { } pub fn channel_item(&mut self, channel_item_id: u64) -> Result, Error> { - self.connection.exec_first( - "SELECT `channel_item_id`, `channel_id`, `pub_date`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_item_id` = ?", - (channel_item_id,) - ).map(|row| { - row.map(|(channel_item_id, channel_id, pub_date, guid, link, title, description)| { - ChannelItem { - channel_item_id, - channel_id, - pub_date, - guid, - link, - title, - description, - } + self.connection + .exec_first( + "SELECT `channel_item_id`, + `channel_id`, + `pub_date`, + `guid`, + `link`, + `title`, + `description` FROM `channel_item` WHERE `channel_item_id` = ?", + (channel_item_id,), + ) + .map(|row| { + row.map( + |(channel_item_id, channel_id, pub_date, guid, link, title, description)| { + ChannelItem { + channel_item_id, + channel_id, + pub_date, + guid, + link, + title, + description, + } + }, + ) }) - }) } pub fn channel_items_by_channel_id_guid( @@ -69,29 +79,24 @@ impl Mysql { ) -> Result, Error> { self.connection.exec_map( format!( - "SELECT `channel_item_id`, `channel_id`, `pub_date`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT)), - ( - channel_id, - guid + "SELECT `channel_item_id`, + `channel_id`, + `pub_date`, + `guid`, + `link`, + `title`, + `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) ), - |( + (channel_id, guid), + |(channel_item_id, channel_id, pub_date, guid, link, title, description)| ChannelItem { channel_item_id, channel_id, pub_date, guid, link, title, - description - )| - ChannelItem { - channel_item_id, - channel_id, - pub_date, - guid, - link, - title, - description + description, }, ) } @@ -106,7 +111,12 @@ impl Mysql { description: Option<&str>, ) -> Result { self.connection.exec_drop( - "INSERT INTO `channel_item` SET `channel_id` = ?, `pub_date` = ?, `guid` = ?, `link` = ?, `title` = ?, `description` = ?", + "INSERT INTO `channel_item` SET `channel_id` = ?, + `pub_date` = ?, + `guid` = ?, + `link` = ?, + `title` = ?, + `description` = ?", (channel_id, pub_date, guid, link, title, description), )?; Ok(self.connection.last_insert_id()) @@ -115,10 +125,20 @@ impl Mysql { pub fn contents(&mut self, limit: Option) -> Result, Error> { self.connection.query_map( format!( - "SELECT `content_id`, `channel_item_id`, `source_id`, `title`, `description` FROM `content` LIMIT {}", + "SELECT `content_id`, + `channel_item_id`, + `source_id`, + `title`, + `description` FROM `content` LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), - |(content_id, channel_item_id,source_id, title, description)| Content { content_id, channel_item_id, source_id, title, description }, + |(content_id, channel_item_id, source_id, title, description)| Content { + content_id, + channel_item_id, + source_id, + title, + description, + }, ) } @@ -130,7 +150,11 @@ impl Mysql { ) -> Result, Error> { self.connection.exec_map( format!( - "SELECT `content_id`, `channel_item_id`, `source_id`, `title`, `description` FROM `content` WHERE `channel_item_id` = ? AND `source_id` = ? LIMIT {}", + "SELECT `content_id`, + `channel_item_id`, + `source_id`, + `title`, + `description` FROM `content` WHERE `channel_item_id` = ? AND `source_id` = ? LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), (channel_item_id, source_id), From e7e3969e001c589dc980674314b9990abaf81a44 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 22:20:57 +0200 Subject: [PATCH 16/65] remove self mutable dependency as pool --- crates/mysql/src/lib.rs | 57 +++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 2267156..9e21990 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -1,7 +1,7 @@ -use mysql::{Error, PooledConn, prelude::Queryable}; +use mysql::{Error, Pool, prelude::Queryable}; pub struct Mysql { - connection: PooledConn, + pool: Pool, } impl Mysql { @@ -13,19 +13,14 @@ impl Mysql { database: &str, ) -> Result { Ok(Self { - connection: mysql::Pool::new( + pool: mysql::Pool::new( format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(), - )? - .get_conn()?, + )?, }) } - pub fn channels_by_url( - &mut self, - url: &str, - limit: Option, - ) -> Result, Error> { - self.connection.exec_map( + pub fn channels_by_url(&self, url: &str, limit: Option) -> Result, Error> { + self.pool.get_conn()?.exec_map( format!( "SELECT `channel_id`, `url` FROM `channel` WHERE `url` = ? LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) @@ -35,15 +30,15 @@ impl Mysql { ) } - pub fn insert_channel(&mut self, url: &str) -> Result { - self.connection - .exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?; - - Ok(self.connection.last_insert_id()) + pub fn insert_channel(&self, url: &str) -> Result { + let mut c = self.pool.get_conn()?; + c.exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?; + Ok(c.last_insert_id()) } - pub fn channel_item(&mut self, channel_item_id: u64) -> Result, Error> { - self.connection + pub fn channel_item(&self, channel_item_id: u64) -> Result, Error> { + self.pool + .get_conn()? .exec_first( "SELECT `channel_item_id`, `channel_id`, @@ -72,12 +67,12 @@ impl Mysql { } pub fn channel_items_by_channel_id_guid( - &mut self, + &self, channel_id: u64, guid: &str, limit: Option, ) -> Result, Error> { - self.connection.exec_map( + self.pool.get_conn()?.exec_map( format!( "SELECT `channel_item_id`, `channel_id`, @@ -102,7 +97,7 @@ impl Mysql { } pub fn insert_channel_item( - &mut self, + &self, channel_id: u64, pub_date: i64, guid: &str, @@ -110,7 +105,8 @@ impl Mysql { title: Option<&str>, description: Option<&str>, ) -> Result { - self.connection.exec_drop( + let mut c = self.pool.get_conn()?; + c.exec_drop( "INSERT INTO `channel_item` SET `channel_id` = ?, `pub_date` = ?, `guid` = ?, @@ -119,11 +115,11 @@ impl Mysql { `description` = ?", (channel_id, pub_date, guid, link, title, description), )?; - Ok(self.connection.last_insert_id()) + Ok(c.last_insert_id()) } - pub fn contents(&mut self, limit: Option) -> Result, Error> { - self.connection.query_map( + pub fn contents(&self, limit: Option) -> Result, Error> { + self.pool.get_conn()?.query_map( format!( "SELECT `content_id`, `channel_item_id`, @@ -143,12 +139,12 @@ impl Mysql { } pub fn contents_by_channel_item_id_source_id( - &mut self, + &self, channel_item_id: u64, source_id: Option, limit: Option, ) -> Result, Error> { - self.connection.exec_map( + self.pool.get_conn()?.exec_map( format!( "SELECT `content_id`, `channel_item_id`, @@ -163,17 +159,18 @@ impl Mysql { } pub fn insert_content( - &mut self, + &self, channel_item_id: u64, source_id: Option, title: String, description: String, ) -> Result { - self.connection.exec_drop( + let mut c = self.pool.get_conn()?; + c.exec_drop( "INSERT INTO `content` SET `channel_item_id` = ?, `source_id` = ?, `title` = ?, `description` = ?", (channel_item_id, source_id, title, description ), )?; - Ok(self.connection.last_insert_id()) + Ok(c.last_insert_id()) } } From eedc9c06fc016126eb285652a74fff5946cf2435 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 22:22:08 +0200 Subject: [PATCH 17/65] remove self mutable dependency as pool --- crates/crawler/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 2c5c414..c221040 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -28,7 +28,7 @@ fn main() -> Result<()> { let argument = Argument::parse(); let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?; - let mut database = Mysql::connect( + let database = Mysql::connect( &config.mysql.host, config.mysql.port, &config.mysql.user, @@ -41,7 +41,7 @@ fn main() -> Result<()> { debug!("Begin new crawl queue..."); for c in &config.channel { debug!("Update `{}`...", c.url); - if let Err(e) = crawl(&mut database, c) { + if let Err(e) = crawl(&database, c) { warn!("Channel `{}` update failed: `{e}`", c.url) } } @@ -55,7 +55,7 @@ fn main() -> Result<()> { } } -fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { +fn crawl(db: &Mysql, channel_config: &config::Channel) -> Result<()> { use rss::Channel; use scraper::Selector; From c52f960cbe4a5dae1487183f4bf462d3b070e950 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 22:33:50 +0200 Subject: [PATCH 18/65] implement `contents_total` method --- crates/mysql/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 9e21990..87ac3ca 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -118,6 +118,14 @@ impl Mysql { Ok(c.last_insert_id()) } + pub fn contents_total(&self) -> Result { + let total: Option = self + .pool + .get_conn()? + .query_first("SELECT COUNT(*) FROM `content`")?; + Ok(total.unwrap_or(0)) + } + pub fn contents(&self, limit: Option) -> Result, Error> { self.pool.get_conn()?.query_map( format!( From 5e0735ebe10220abb02242880ec74effc709b409 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 22:56:57 +0200 Subject: [PATCH 19/65] implement `content` method --- crates/mysql/src/lib.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 87ac3ca..1c16878 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -1,4 +1,7 @@ -use mysql::{Error, Pool, prelude::Queryable}; +use mysql::{ + Error, Pool, + prelude::{FromRow, Queryable}, +}; pub struct Mysql { pool: Pool, @@ -118,6 +121,17 @@ impl Mysql { Ok(c.last_insert_id()) } + pub fn content(&self, content_id: u64) -> Result, Error> { + self.pool.get_conn()?.exec_first( + "SELECT `content_id`, + `channel_item_id`, + `source_id`, + `title`, + `description` FROM `content` WHERE `content_id` = ?", + (content_id,), + ) + } + pub fn contents_total(&self) -> Result { let total: Option = self .pool @@ -199,7 +213,7 @@ pub struct ChannelItem { pub description: Option, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, FromRow)] pub struct Content { pub content_id: u64, pub channel_item_id: u64, From 4c99208535372e98c5a082fff4b564b0ff3c0f9e Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 23:00:10 +0200 Subject: [PATCH 20/65] use `FromRow` trait --- crates/mysql/src/lib.rs | 80 ++++++++++++----------------------------- 1 file changed, 22 insertions(+), 58 deletions(-) diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 1c16878..5e52fe5 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -40,33 +40,16 @@ impl Mysql { } pub fn channel_item(&self, channel_item_id: u64) -> Result, Error> { - self.pool - .get_conn()? - .exec_first( - "SELECT `channel_item_id`, - `channel_id`, - `pub_date`, - `guid`, - `link`, - `title`, - `description` FROM `channel_item` WHERE `channel_item_id` = ?", - (channel_item_id,), - ) - .map(|row| { - row.map( - |(channel_item_id, channel_id, pub_date, guid, link, title, description)| { - ChannelItem { - channel_item_id, - channel_id, - pub_date, - guid, - link, - title, - description, - } - }, - ) - }) + self.pool.get_conn()?.exec_first( + "SELECT `channel_item_id`, + `channel_id`, + `pub_date`, + `guid`, + `link`, + `title`, + `description` FROM `channel_item` WHERE `channel_item_id` = ?", + (channel_item_id,), + ) } pub fn channel_items_by_channel_id_guid( @@ -75,7 +58,7 @@ impl Mysql { guid: &str, limit: Option, ) -> Result, Error> { - self.pool.get_conn()?.exec_map( + self.pool.get_conn()?.exec( format!( "SELECT `channel_item_id`, `channel_id`, @@ -87,15 +70,6 @@ impl Mysql { limit.unwrap_or(DEFAULT_LIMIT) ), (channel_id, guid), - |(channel_item_id, channel_id, pub_date, guid, link, title, description)| ChannelItem { - channel_item_id, - channel_id, - pub_date, - guid, - link, - title, - description, - }, ) } @@ -141,23 +115,14 @@ impl Mysql { } pub fn contents(&self, limit: Option) -> Result, Error> { - self.pool.get_conn()?.query_map( - format!( - "SELECT `content_id`, - `channel_item_id`, - `source_id`, - `title`, - `description` FROM `content` LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - ), - |(content_id, channel_item_id, source_id, title, description)| Content { - content_id, - channel_item_id, - source_id, - title, - description, - }, - ) + self.pool.get_conn()?.query(format!( + "SELECT `content_id`, + `channel_item_id`, + `source_id`, + `title`, + `description` FROM `content` LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + )) } pub fn contents_by_channel_item_id_source_id( @@ -166,7 +131,7 @@ impl Mysql { source_id: Option, limit: Option, ) -> Result, Error> { - self.pool.get_conn()?.exec_map( + self.pool.get_conn()?.exec( format!( "SELECT `content_id`, `channel_item_id`, @@ -176,7 +141,6 @@ impl Mysql { limit.unwrap_or(DEFAULT_LIMIT) ), (channel_item_id, source_id), - |(content_id, channel_item_id,source_id, title, description)| Content { content_id, channel_item_id, source_id, title, description }, ) } @@ -196,13 +160,13 @@ impl Mysql { } } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, FromRow)] pub struct Channel { pub channel_id: u64, pub url: String, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, FromRow)] pub struct ChannelItem { pub channel_item_id: u64, pub channel_id: u64, From 353c78b2f029bab852234eb46a968dcc6b31023a Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 23:25:02 +0200 Subject: [PATCH 21/65] draft initial http application --- Cargo.toml | 1 + crates/http/Cargo.toml | 17 +++ crates/http/LICENSE | 21 +++ crates/http/README.md | 3 + crates/http/src/config.rs | 50 +++++++ crates/http/src/feed.rs | 58 ++++++++ crates/http/src/global.rs | 8 ++ crates/http/src/main.rs | 181 +++++++++++++++++++++++++ crates/http/src/meta.rs | 9 ++ crates/http/templates/index.html.tera | 24 ++++ crates/http/templates/info.html.tera | 12 ++ crates/http/templates/layout.html.tera | 22 +++ 12 files changed, 406 insertions(+) create mode 100644 crates/http/Cargo.toml create mode 100644 crates/http/LICENSE create mode 100644 crates/http/README.md create mode 100644 crates/http/src/config.rs create mode 100644 crates/http/src/feed.rs create mode 100644 crates/http/src/global.rs create mode 100644 crates/http/src/main.rs create mode 100644 crates/http/src/meta.rs create mode 100644 crates/http/templates/index.html.tera create mode 100644 crates/http/templates/info.html.tera create mode 100644 crates/http/templates/layout.html.tera diff --git a/Cargo.toml b/Cargo.toml index 22e1445..d5a938d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,5 +2,6 @@ resolver = "2" members = [ "crates/crawler", + "crates/http", "crates/mysql", ] \ No newline at end of file diff --git a/crates/http/Cargo.toml b/crates/http/Cargo.toml new file mode 100644 index 0000000..e2cfbf8 --- /dev/null +++ b/crates/http/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "rssto-http" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "Web server for the rssto DB, based on Rocket engine" +keywords = ["rss", "aggregator", "http", "server"] +categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] +repository = "https://github.com/YGGverse/rssto" + +[dependencies] +chrono = { version = "0.4.41", features = ["serde"] } +clap = { version = "4.5.54", features = ["derive"] } +mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } +rocket = "0.5.1" +rocket_dyn_templates = { version = "0.2.0", features = ["tera"] } diff --git a/crates/http/LICENSE b/crates/http/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/http/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/http/README.md b/crates/http/README.md new file mode 100644 index 0000000..b92db47 --- /dev/null +++ b/crates/http/README.md @@ -0,0 +1,3 @@ +# rssto-http + +Web server implementation based on the Rocket engine diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs new file mode 100644 index 0000000..7b53c13 --- /dev/null +++ b/crates/http/src/config.rs @@ -0,0 +1,50 @@ +use clap::Parser; +use std::net::{IpAddr, Ipv4Addr}; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +pub struct Config { + /// Server name + #[arg(long, default_value_t = String::from("rssto"))] + pub title: String, + + /// Server description + #[arg(long)] + pub description: Option, + + /// Format timestamps (on the web view) + /// + /// * tip: escape with `%%d/%%m/%%Y %%H:%%M` in the CLI/bash argument + #[arg(long, default_value_t = String::from("%d/%m/%Y %H:%M"))] + pub format_time: String, + + /// Default listing limit + #[arg(long, default_value_t = 20)] + pub list_limit: usize, + + /// Default capacity (estimated torrents in the `public` directory) + #[arg(long, default_value_t = 1000)] + pub capacity: usize, + + /// Bind server on given host + #[arg(long, default_value_t = IpAddr::V4(Ipv4Addr::LOCALHOST))] + pub host: IpAddr, + + /// Bind server on given port + #[arg(long, short, default_value_t = 8000)] + pub port: u16, + + /// Configure instance in the debug mode + #[arg(long, default_value_t = false)] + pub debug: bool, + + // Database + #[arg(long, default_value_t = String::from("localhost"))] + pub mysql_host: String, + #[arg(long, default_value_t = 3306)] + pub mysql_port: u16, + + pub mysql_user: String, + pub mysql_password: String, + pub mysql_database: String, +} diff --git a/crates/http/src/feed.rs b/crates/http/src/feed.rs new file mode 100644 index 0000000..73d582e --- /dev/null +++ b/crates/http/src/feed.rs @@ -0,0 +1,58 @@ +/// Export crawl index to the RSS file +pub struct Feed { + buffer: String, +} + +impl Feed { + pub fn new(title: &str, description: Option<&str>, capacity: usize) -> Self { + let t = chrono::Utc::now().to_rfc2822(); + let mut buffer = String::with_capacity(capacity); + + buffer.push_str(""); + + buffer.push_str(&format!("{t}")); + buffer.push_str(&format!("{t}")); + buffer.push_str(&format!("{}", escape(title))); + + if let Some(d) = description { + buffer.push_str(&format!("{}", escape(d))); + } + + Self { buffer } + } + + /// Append `item` to the feed `channel` + pub fn push( + &mut self, + guid: u64, + time: chrono::DateTime, + url: String, + title: String, + description: String, + ) { + self.buffer.push_str(&format!( + "{guid}{}{url}{}{}", + escape(&title), + escape(&description), + time.to_rfc2822() + )) + } + + /// Write final bytes + pub fn commit(mut self) -> String { + self.buffer.push_str(""); + self.buffer + } +} + +// @TODO use tera filters? +// https://keats.github.io/tera/docs/#built-in-filters + +fn escape(value: &str) -> String { + value + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) + .replace("'", "'") +} diff --git a/crates/http/src/global.rs b/crates/http/src/global.rs new file mode 100644 index 0000000..933e67e --- /dev/null +++ b/crates/http/src/global.rs @@ -0,0 +1,8 @@ +use rocket::serde::Serialize; + +#[derive(Clone, Debug, Serialize)] +#[serde(crate = "rocket::serde")] +pub struct Global { + pub list_limit: usize, + pub format_time: String, +} diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs new file mode 100644 index 0000000..43f8db0 --- /dev/null +++ b/crates/http/src/main.rs @@ -0,0 +1,181 @@ +#[macro_use] +extern crate rocket; + +mod config; +mod feed; +mod global; +mod meta; + +use chrono::{DateTime, Utc}; +use config::Config; +use feed::Feed; +use global::Global; +use meta::Meta; +use mysql::Mysql; +use rocket::{State, http::Status, response::content::RawXml, serde::Serialize}; +use rocket_dyn_templates::{Template, context}; + +#[get("/?&")] +fn index( + search: Option<&str>, + page: Option, + db: &State, + meta: &State, + global: &State, +) -> Result { + #[derive(Serialize)] + #[serde(crate = "rocket::serde")] + struct Content { + content_id: u64, + description: String, + link: String, + time: String, + title: String, + } + let total = db.contents_total().map_err(|e| { + error!("Could not get contents total: `{e}`"); + Status::InternalServerError + })?; + Ok(Template::render( + "index", + context! { + title: { + let mut t = String::new(); + if let Some(q) = search && !q.is_empty() { + t.push_str(q); + t.push_str(S); + t.push_str("Search"); + t.push_str(S) + } + if let Some(p) = page && p > 1 { + t.push_str(&format!("Page {p}")); + t.push_str(S) + } + t.push_str(&meta.title); + if let Some(ref description) = meta.description + && page.is_none_or(|p| p == 1) && search.is_none_or(|q| q.is_empty()) { + t.push_str(S); + t.push_str(description) + } + t + }, + meta: meta.inner(), + back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), + next: if page.unwrap_or(1) * global.list_limit >= total { None } + else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, + rows: db.contents(Some(global.list_limit)).map_err(|e| { + error!("Could not get contents: `{e}`"); + Status::InternalServerError + })? + .into_iter() + .map(|c| { + let channel_item = db.channel_item(c.channel_item_id).unwrap().unwrap(); + Content { + content_id: c.content_id, + description: c.description, + link: channel_item.link, + time: time(channel_item.pub_date).format(&global.format_time).to_string(), + title: c.title, + } + }) + .collect::>(), + page: page.unwrap_or(1), + pages: (total as f64 / global.list_limit as f64).ceil(), + total, + search + }, + )) +} + +#[get("/")] +fn info( + content_id: u64, + db: &State, + meta: &State, + global: &State, +) -> Result { + match db.content(content_id).map_err(|e| { + error!("Could not get content `{content_id}`: `{e}`"); + Status::InternalServerError + })? { + Some(c) => { + let i = db.channel_item(c.channel_item_id).unwrap().unwrap(); + Ok(Template::render( + "info", + context! { + title: format!("{}{S}{}", c.title, meta.title), + description: c.description, + link: i.link, + time: time(i.pub_date).format(&global.format_time).to_string(), + }, + )) + } + None => Err(Status::NotFound), + } +} + +#[get("/rss")] +fn rss(meta: &State, db: &State) -> Result, Status> { + let mut f = Feed::new( + &meta.title, + meta.description.as_deref(), + 1024, // @TODO + ); + for c in db + .contents(Some(20)) // @TODO + .map_err(|e| { + error!("Could not load channel item contents: `{e}`"); + Status::InternalServerError + })? + { + let channel_item = db.channel_item(c.channel_item_id).unwrap().unwrap(); + f.push( + c.channel_item_id, + time(channel_item.pub_date), + channel_item.link, + c.title, + c.description, + ) + } + Ok(RawXml(f.commit())) +} + +#[launch] +fn rocket() -> _ { + use clap::Parser; + let config = Config::parse(); + rocket::build() + .attach(Template::fairing()) + .configure(rocket::Config { + port: config.port, + address: config.host, + ..if config.debug { + rocket::Config::debug_default() + } else { + rocket::Config::release_default() + } + }) + .manage(Mysql::connect( + &config.mysql_host, + config.mysql_port, + &config.mysql_user, + &config.mysql_password, + &config.mysql_database, + )) + .manage(Global { + format_time: config.format_time, + list_limit: config.list_limit, + }) + .manage(Meta { + description: config.description, + title: config.title, + version: env!("CARGO_PKG_VERSION").into(), + }) + .mount("/", routes![index, rss, info]) +} + +const S: &str = " • "; + +fn time(timestamp: i64) -> DateTime { + DateTime::::from_timestamp(timestamp, 0).unwrap() +} diff --git a/crates/http/src/meta.rs b/crates/http/src/meta.rs new file mode 100644 index 0000000..c8512d0 --- /dev/null +++ b/crates/http/src/meta.rs @@ -0,0 +1,9 @@ +use rocket::serde::Serialize; + +#[derive(Clone, Debug, Serialize)] +#[serde(crate = "rocket::serde")] +pub struct Meta { + pub description: Option, + pub title: String, + pub version: String, +} diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera new file mode 100644 index 0000000..9c6c87f --- /dev/null +++ b/crates/http/templates/index.html.tera @@ -0,0 +1,24 @@ +{% extends "layout" %} +{% block content %} + {% if rows %} + {% for row in rows %} +
+ +

{{ row.title }}

+ {% if row.time %}

{{ row.time }}

{% endif %} +
+ {{ row.description }} +
+
+ {% endfor %} + {% else %} +
Nothing.
+ {% endif %} + {% if next %}Next{% endif %} + {% if back %}Back{% endif %} + {% if total %} + + Page {{ page }} / {{ pages }} ({{ total }} torrent{{ total | pluralize(plural="s") }} total) + + {% endif %} +{% endblock content %} \ No newline at end of file diff --git a/crates/http/templates/info.html.tera b/crates/http/templates/info.html.tera new file mode 100644 index 0000000..a2516d9 --- /dev/null +++ b/crates/http/templates/info.html.tera @@ -0,0 +1,12 @@ +{% extends "layout" %} +{% block content %} +
+

{{ title }}

+
+ {{ description }} +
+ +
+{% endblock content %} \ No newline at end of file diff --git a/crates/http/templates/layout.html.tera b/crates/http/templates/layout.html.tera new file mode 100644 index 0000000..7049747 --- /dev/null +++ b/crates/http/templates/layout.html.tera @@ -0,0 +1,22 @@ + + + + + {{ title }} + {% if meta.description %} + + {% endif %} + + +
+ {{ meta.title }} +
+ + +
+
+
+ {% block content %}{% endblock content %} +
+ + \ No newline at end of file From 5ac1f29f5fe5ef23c590c9ec4d27760043a80353 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 23:27:00 +0200 Subject: [PATCH 22/65] update components list --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9b14499..7650e13 100644 --- a/README.md +++ b/README.md @@ -11,5 +11,6 @@ Convert RSS feeds into multiple formats ## Components -* `rssto-mysql` - shared database library -* `rssto-crawler` - RSS feed reader and data scrapper daemon \ No newline at end of file +* `rssto-crawler` - RSS feed reader and data scrapper daemon +* `rssto-http` - Web server implementation based on the Rocket engine +* `rssto-mysql` - Shared database library From d176ee87ea8bb5468a0ad409c7e04ee1edd8a9ca Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 23:28:30 +0200 Subject: [PATCH 23/65] update components list --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7650e13..52c2748 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Convert RSS feeds into multiple formats ## Components -* `rssto-crawler` - RSS feed reader and data scrapper daemon -* `rssto-http` - Web server implementation based on the Rocket engine -* `rssto-mysql` - Shared database library +* [x] `rssto-crawler` - RSS feed reader and data scrapper daemon +* [x] `rssto-http` - Web server implementation based on the Rocket engine +* [x] `rssto-mysql` - Shared database library +* [ ] `rssto-llama` - Feeds auto-translation tool \ No newline at end of file From 621dad3810cc6363f1b57ff5bcbfb60963e4a20a Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 23:52:15 +0200 Subject: [PATCH 24/65] update config name --- crates/http/src/config.rs | 2 +- crates/http/src/main.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs index 7b53c13..1017ad3 100644 --- a/crates/http/src/config.rs +++ b/crates/http/src/config.rs @@ -44,7 +44,7 @@ pub struct Config { #[arg(long, default_value_t = 3306)] pub mysql_port: u16, - pub mysql_user: String, + pub mysql_username: String, pub mysql_password: String, pub mysql_database: String, } diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 43f8db0..f95647c 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -158,7 +158,7 @@ fn rocket() -> _ { .manage(Mysql::connect( &config.mysql_host, config.mysql_port, - &config.mysql_user, + &config.mysql_username, &config.mysql_password, &config.mysql_database, )) From 71e6a97e378f8dafc025d45e8b41e7ae90dca563 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 23:55:24 +0200 Subject: [PATCH 25/65] add missed annotations --- crates/http/src/config.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs index 1017ad3..db36793 100644 --- a/crates/http/src/config.rs +++ b/crates/http/src/config.rs @@ -43,8 +43,10 @@ pub struct Config { pub mysql_host: String, #[arg(long, default_value_t = 3306)] pub mysql_port: u16, - + #[arg(long)] pub mysql_username: String, + #[arg(long)] pub mysql_password: String, + #[arg(long)] pub mysql_database: String, } From 5f5c77f360dd02bd77c62f9e87bc216cd2e0005f Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 23:56:13 +0200 Subject: [PATCH 26/65] add launch example --- crates/http/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/http/README.md b/crates/http/README.md index b92db47..33d5584 100644 --- a/crates/http/README.md +++ b/crates/http/README.md @@ -1,3 +1,12 @@ # rssto-http Web server implementation based on the Rocket engine + +> [!NOTE] +> In development! + +``` +cargo run -p rssto-http -- --mysql-username USER \ + --mysql-password PASS \ + --mysql-database NAME +``` \ No newline at end of file From a53417ce5237029348be83742d2866ef8e1e9bd4 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:07:11 +0200 Subject: [PATCH 27/65] fix mysql management entry --- crates/http/src/main.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index f95647c..eabbc97 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -155,13 +155,16 @@ fn rocket() -> _ { rocket::Config::release_default() } }) - .manage(Mysql::connect( - &config.mysql_host, - config.mysql_port, - &config.mysql_username, - &config.mysql_password, - &config.mysql_database, - )) + .manage( + Mysql::connect( + &config.mysql_host, + config.mysql_port, + &config.mysql_username, + &config.mysql_password, + &config.mysql_database, + ) + .unwrap(), + ) .manage(Global { format_time: config.format_time, list_limit: config.list_limit, From 847b64e9356c4bc8c5d7d5c50e14279b508fb4b1 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:07:32 +0200 Subject: [PATCH 28/65] update example --- crates/http/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/http/README.md b/crates/http/README.md index 33d5584..3034cb8 100644 --- a/crates/http/README.md +++ b/crates/http/README.md @@ -6,7 +6,8 @@ Web server implementation based on the Rocket engine > In development! ``` -cargo run -p rssto-http -- --mysql-username USER \ - --mysql-password PASS \ - --mysql-database NAME +cd rssto/crates/rssto-http +cargo run -- --mysql-username USER \ + --mysql-password PASS \ + --mysql-database NAME ``` \ No newline at end of file From 5ee46f6df6e4dcab443e4b625fed0b066b074efa Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:10:50 +0200 Subject: [PATCH 29/65] fix internal reference --- crates/http/templates/index.html.tera | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera index 9c6c87f..9325d73 100644 --- a/crates/http/templates/index.html.tera +++ b/crates/http/templates/index.html.tera @@ -4,7 +4,7 @@ {% for row in rows %}
-

{{ row.title }}

+

{{ row.title }}

{% if row.time %}

{{ row.time }}

{% endif %}
{{ row.description }} From 6818c1458087d641bb6b1d7b21d8788389e2fd1e Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:12:51 +0200 Subject: [PATCH 30/65] add missed meta member --- crates/http/src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index eabbc97..27035d9 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -103,6 +103,7 @@ fn info( Ok(Template::render( "info", context! { + meta: meta.inner(), title: format!("{}{S}{}", c.title, meta.title), description: c.description, link: i.link, From add90d66d721043da480942ce3fe45e2e39223fc Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:13:29 +0200 Subject: [PATCH 31/65] reorder source link --- crates/http/templates/info.html.tera | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/http/templates/info.html.tera b/crates/http/templates/info.html.tera index a2516d9..441446d 100644 --- a/crates/http/templates/info.html.tera +++ b/crates/http/templates/info.html.tera @@ -2,11 +2,9 @@ {% block content %}

{{ title }}

+ {{ time }}
{{ description }}
-
{% endblock content %} \ No newline at end of file From d63d99d392fe18d2df05ee4830fc53e1bbee46cf Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:14:03 +0200 Subject: [PATCH 32/65] add safe filter --- crates/http/templates/info.html.tera | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/http/templates/info.html.tera b/crates/http/templates/info.html.tera index 441446d..28f8b07 100644 --- a/crates/http/templates/info.html.tera +++ b/crates/http/templates/info.html.tera @@ -4,7 +4,7 @@

{{ title }}

{{ time }}
- {{ description }} + {{ description | safe }}
{% endblock content %} \ No newline at end of file From c324246cc6520efbb471fcb0f29772c5046ce344 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:14:32 +0200 Subject: [PATCH 33/65] add safe filter --- crates/http/templates/index.html.tera | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera index 9325d73..8c36d2f 100644 --- a/crates/http/templates/index.html.tera +++ b/crates/http/templates/index.html.tera @@ -7,7 +7,7 @@

{{ row.title }}

{% if row.time %}

{{ row.time }}

{% endif %}
- {{ row.description }} + {{ row.description | safe }}
{% endfor %} From 5d232f54c9859aa653d64b57df0ac5d40d277a57 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:22:07 +0200 Subject: [PATCH 34/65] implement default sort ordering features --- crates/http/src/main.rs | 6 +++--- crates/mysql/src/lib.rs | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 27035d9..f88b15e 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -11,7 +11,7 @@ use config::Config; use feed::Feed; use global::Global; use meta::Meta; -use mysql::Mysql; +use mysql::{Mysql, Sort}; use rocket::{State, http::Status, response::content::RawXml, serde::Serialize}; use rocket_dyn_templates::{Template, context}; @@ -63,7 +63,7 @@ fn index( back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), next: if page.unwrap_or(1) * global.list_limit >= total { None } else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, - rows: db.contents(Some(global.list_limit)).map_err(|e| { + rows: db.contents(Sort::Desc, Some(global.list_limit)).map_err(|e| { error!("Could not get contents: `{e}`"); Status::InternalServerError })? @@ -123,7 +123,7 @@ fn rss(meta: &State, db: &State) -> Result, Status> 1024, // @TODO ); for c in db - .contents(Some(20)) // @TODO + .contents(Sort::Desc, Some(20)) // @TODO .map_err(|e| { error!("Could not load channel item contents: `{e}`"); Status::InternalServerError diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 5e52fe5..32f8018 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -114,13 +114,13 @@ impl Mysql { Ok(total.unwrap_or(0)) } - pub fn contents(&self, limit: Option) -> Result, Error> { + pub fn contents(&self, sort: Sort, limit: Option) -> Result, Error> { self.pool.get_conn()?.query(format!( "SELECT `content_id`, `channel_item_id`, `source_id`, `title`, - `description` FROM `content` LIMIT {}", + `description` FROM `content` ORDER BY `content_id` {sort} LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) )) } @@ -188,4 +188,18 @@ pub struct Content { pub description: String, } +pub enum Sort { + Asc, + Desc, +} + +impl std::fmt::Display for Sort { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::Asc => write!(f, "ASC"), + Self::Desc => write!(f, "DESC"), + } + } +} + const DEFAULT_LIMIT: usize = 100; From af3a6d0cd6e5da501f47d5e7938a3b0558c75644 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:24:38 +0200 Subject: [PATCH 35/65] cleanup --- crates/http/src/config.rs | 4 ---- crates/http/templates/index.html.tera | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs index db36793..5247a45 100644 --- a/crates/http/src/config.rs +++ b/crates/http/src/config.rs @@ -22,10 +22,6 @@ pub struct Config { #[arg(long, default_value_t = 20)] pub list_limit: usize, - /// Default capacity (estimated torrents in the `public` directory) - #[arg(long, default_value_t = 1000)] - pub capacity: usize, - /// Bind server on given host #[arg(long, default_value_t = IpAddr::V4(Ipv4Addr::LOCALHOST))] pub host: IpAddr, diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera index 8c36d2f..d695261 100644 --- a/crates/http/templates/index.html.tera +++ b/crates/http/templates/index.html.tera @@ -18,7 +18,7 @@ {% if back %}Back{% endif %} {% if total %} - Page {{ page }} / {{ pages }} ({{ total }} torrent{{ total | pluralize(plural="s") }} total) + Page {{ page }} / {{ pages }} ({{ total }} total) {% endif %} {% endblock content %} \ No newline at end of file From 54b3ad7d89a1f20745e002fe339fd36b6cbbdd34 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:25:32 +0200 Subject: [PATCH 36/65] add header tag --- crates/http/templates/layout.html.tera | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/http/templates/layout.html.tera b/crates/http/templates/layout.html.tera index 7049747..a722e34 100644 --- a/crates/http/templates/layout.html.tera +++ b/crates/http/templates/layout.html.tera @@ -9,7 +9,7 @@
- {{ meta.title }} +

{{ meta.title }}

From 7b9fac8d3a37b3ce14ae84a7f3dc570272985342 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:27:48 +0200 Subject: [PATCH 37/65] separate entries --- crates/http/src/main.rs | 5 +++-- crates/http/templates/info.html.tera | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index f88b15e..ff5be61 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -103,10 +103,11 @@ fn info( Ok(Template::render( "info", context! { - meta: meta.inner(), - title: format!("{}{S}{}", c.title, meta.title), description: c.description, link: i.link, + meta: meta.inner(), + title: format!("{}{S}{}", c.title, meta.title), + name: c.title, time: time(i.pub_date).format(&global.format_time).to_string(), }, )) diff --git a/crates/http/templates/info.html.tera b/crates/http/templates/info.html.tera index 28f8b07..deeaddf 100644 --- a/crates/http/templates/info.html.tera +++ b/crates/http/templates/info.html.tera @@ -1,7 +1,7 @@ {% extends "layout" %} {% block content %}
-

{{ title }}

+

{{ name }}

{{ time }}
{{ description | safe }} From e26236a5916b40a7d38143becc4cb09b398b2a39 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:42:26 +0200 Subject: [PATCH 38/65] rename table --- crates/crawler/src/main.rs | 2 +- crates/mysql/database/0.1.0.sql | 22 +++++++++++----------- crates/mysql/src/lib.rs | 22 +++++++++++----------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index c221040..eb57623 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -187,7 +187,7 @@ fn crawl(db: &Mysql, channel_config: &config::Channel) -> Result<()> { }, }; assert!( - db.contents_by_channel_item_id_source_id(channel_item_id, None, Some(1))? + db.contents_by_channel_item_id_provider_id(channel_item_id, None, Some(1))? .is_empty() ); let _content_id = db.insert_content(channel_item_id, None, title, description)?; diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index d35b99d..5eb70ff 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- Wed 07 Jan 2026 04:18:03 PM EET +-- Thu 08 Jan 2026 12:40:45 AM EET -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -51,12 +51,12 @@ ENGINE = InnoDB; -- ----------------------------------------------------- --- Table `rssto`.`source` +-- Table `rssto`.`provider` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `rssto`.`source` ( - `source_id` INT NOT NULL AUTO_INCREMENT, +CREATE TABLE IF NOT EXISTS `rssto`.`provider` ( + `provider_id` INT NOT NULL AUTO_INCREMENT, `name` VARCHAR(255) NOT NULL, - PRIMARY KEY (`source_id`), + PRIMARY KEY (`provider_id`), UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE) ENGINE = InnoDB; @@ -67,21 +67,21 @@ ENGINE = InnoDB; CREATE TABLE IF NOT EXISTS `rssto`.`content` ( `content_id` BIGINT NOT NULL AUTO_INCREMENT, `channel_item_id` INT NOT NULL, - `source_id` INT NULL, + `provider_id` INT NULL, `title` VARCHAR(255) NOT NULL, `description` LONGTEXT NOT NULL, PRIMARY KEY (`content_id`), INDEX `fk_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE, - INDEX `fk_content_source_idx` (`source_id` ASC) VISIBLE, - UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `source_id` ASC) VISIBLE, + INDEX `fk_content_provider_idx` (`provider_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE, CONSTRAINT `fk_content_channel_item` FOREIGN KEY (`channel_item_id`) REFERENCES `rssto`.`channel_item` (`channel_item_id`) ON DELETE NO ACTION ON UPDATE NO ACTION, - CONSTRAINT `fk_content_source` - FOREIGN KEY (`source_id`) - REFERENCES `rssto`.`source` (`source_id`) + CONSTRAINT `fk_content_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) ON DELETE NO ACTION ON UPDATE NO ACTION) ENGINE = InnoDB; diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 32f8018..21b6f9d 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -99,7 +99,7 @@ impl Mysql { self.pool.get_conn()?.exec_first( "SELECT `content_id`, `channel_item_id`, - `source_id`, + `provider_id`, `title`, `description` FROM `content` WHERE `content_id` = ?", (content_id,), @@ -118,43 +118,43 @@ impl Mysql { self.pool.get_conn()?.query(format!( "SELECT `content_id`, `channel_item_id`, - `source_id`, + `provider_id`, `title`, `description` FROM `content` ORDER BY `content_id` {sort} LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) )) } - pub fn contents_by_channel_item_id_source_id( + pub fn contents_by_channel_item_id_provider_id( &self, channel_item_id: u64, - source_id: Option, + provider_id: Option, limit: Option, ) -> Result, Error> { self.pool.get_conn()?.exec( format!( "SELECT `content_id`, `channel_item_id`, - `source_id`, + `provider_id`, `title`, - `description` FROM `content` WHERE `channel_item_id` = ? AND `source_id` = ? LIMIT {}", + `description` FROM `content` WHERE `channel_item_id` = ? AND `provider_id` = ? LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), - (channel_item_id, source_id), + (channel_item_id, provider_id), ) } pub fn insert_content( &self, channel_item_id: u64, - source_id: Option, + provider_id: Option, title: String, description: String, ) -> Result { let mut c = self.pool.get_conn()?; c.exec_drop( - "INSERT INTO `content` SET `channel_item_id` = ?, `source_id` = ?, `title` = ?, `description` = ?", - (channel_item_id, source_id, title, description ), + "INSERT INTO `content` SET `channel_item_id` = ?, `provider_id` = ?, `title` = ?, `description` = ?", + (channel_item_id, provider_id, title, description ), )?; Ok(c.last_insert_id()) } @@ -183,7 +183,7 @@ pub struct Content { pub channel_item_id: u64, /// None if the original `title` and `description` values /// parsed from the channel item on crawl - pub source_id: Option, + pub provider_id: Option, pub title: String, pub description: String, } From 7a284474389c6c75eb70bab40bf01f49587012c6 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:53:47 +0200 Subject: [PATCH 39/65] update description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 52c2748..e9ba5b5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Dependencies](https://deps.rs/repo/github/YGGverse/rssto/status.svg)](https://deps.rs/repo/github/YGGverse/rssto) [![crates.io](https://img.shields.io/crates/v/rssto.svg)](https://crates.io/crates/rssto) -Convert RSS feeds into multiple formats +Crawl content from RSS feeds into multiple formats > [!NOTE] > Branch in development! From aaad4fd49d2ae2b26df48b9c28e1a595cac3d4bb Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 00:57:25 +0200 Subject: [PATCH 40/65] update systemd option example --- crates/crawler/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/crawler/README.md b/crates/crawler/README.md index 9ea940f..99fa260 100644 --- a/crates/crawler/README.md +++ b/crates/crawler/README.md @@ -37,7 +37,7 @@ User=rssto Group=rssto # Uncomment for debug -# Environment="RUST_LOG=DEBUG" +# Environment="RUST_LOG=rssto_crawler=debug" # Environment="NO_COLOR=1" ExecStart=/usr/local/bin/rssto-crawler -c /path/to/config.toml From 5608e2e0810dc9680ec51f0bd3a2bdf84647e9dd Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 11:12:42 +0200 Subject: [PATCH 41/65] implement `provider_id` filter --- crates/http/src/config.rs | 5 +++++ crates/http/src/global.rs | 3 ++- crates/http/src/main.rs | 21 ++++++++++++++------- crates/mysql/src/lib.rs | 24 +++++++++++++++--------- 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs index 5247a45..56c3f41 100644 --- a/crates/http/src/config.rs +++ b/crates/http/src/config.rs @@ -18,6 +18,11 @@ pub struct Config { #[arg(long, default_value_t = String::from("%d/%m/%Y %H:%M"))] pub format_time: String, + /// Provider ID (`provider` table) + /// * None for the original content + #[arg(long, short)] + pub provider_id: Option, + /// Default listing limit #[arg(long, default_value_t = 20)] pub list_limit: usize, diff --git a/crates/http/src/global.rs b/crates/http/src/global.rs index 933e67e..8e25ad0 100644 --- a/crates/http/src/global.rs +++ b/crates/http/src/global.rs @@ -3,6 +3,7 @@ use rocket::serde::Serialize; #[derive(Clone, Debug, Serialize)] #[serde(crate = "rocket::serde")] pub struct Global { - pub list_limit: usize, pub format_time: String, + pub list_limit: usize, + pub provider_id: Option, } diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index ff5be61..1e3302d 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -32,10 +32,12 @@ fn index( time: String, title: String, } - let total = db.contents_total().map_err(|e| { - error!("Could not get contents total: `{e}`"); - Status::InternalServerError - })?; + let total = db + .contents_total_by_provider_id(global.provider_id) + .map_err(|e| { + error!("Could not get contents total: `{e}`"); + Status::InternalServerError + })?; Ok(Template::render( "index", context! { @@ -63,7 +65,7 @@ fn index( back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), next: if page.unwrap_or(1) * global.list_limit >= total { None } else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, - rows: db.contents(Sort::Desc, Some(global.list_limit)).map_err(|e| { + rows: db.contents_by_provider_id(global.provider_id, Sort::Desc, Some(global.list_limit)).map_err(|e| { error!("Could not get contents: `{e}`"); Status::InternalServerError })? @@ -117,14 +119,18 @@ fn info( } #[get("/rss")] -fn rss(meta: &State, db: &State) -> Result, Status> { +fn rss( + global: &State, + meta: &State, + db: &State, +) -> Result, Status> { let mut f = Feed::new( &meta.title, meta.description.as_deref(), 1024, // @TODO ); for c in db - .contents(Sort::Desc, Some(20)) // @TODO + .contents_by_provider_id(global.provider_id, Sort::Desc, Some(20)) // @TODO .map_err(|e| { error!("Could not load channel item contents: `{e}`"); Status::InternalServerError @@ -170,6 +176,7 @@ fn rocket() -> _ { .manage(Global { format_time: config.format_time, list_limit: config.list_limit, + provider_id: config.provider_id, }) .manage(Meta { description: config.description, diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 21b6f9d..e601545 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -106,23 +106,29 @@ impl Mysql { ) } - pub fn contents_total(&self) -> Result { - let total: Option = self - .pool - .get_conn()? - .query_first("SELECT COUNT(*) FROM `content`")?; + pub fn contents_total_by_provider_id(&self, provider_id: Option) -> Result { + let total: Option = self.pool.get_conn()?.exec_first( + "SELECT COUNT(*) FROM `content` WHERE `provider_id` = ?", + (provider_id,), + )?; Ok(total.unwrap_or(0)) } - pub fn contents(&self, sort: Sort, limit: Option) -> Result, Error> { - self.pool.get_conn()?.query(format!( + pub fn contents_by_provider_id( + &self, + provider_id: Option, + sort: Sort, + limit: Option, + ) -> Result, Error> { + self.pool.get_conn()?.exec(format!( "SELECT `content_id`, `channel_item_id`, `provider_id`, `title`, - `description` FROM `content` ORDER BY `content_id` {sort} LIMIT {}", + `description` FROM `content` WHERE `provider_id` = ? ORDER BY `content_id` {sort} LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) - )) + ), + (provider_id, )) } pub fn contents_by_channel_item_id_provider_id( From de3fda435a410184250e4a620480cfa248bb8d62 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 14:05:16 +0200 Subject: [PATCH 42/65] implement initial llm features --- Cargo.toml | 1 + README.md | 8 +-- crates/crawler/src/main.rs | 2 +- crates/http/README.md | 9 ++-- crates/llm/Cargo.toml | 20 +++++++ crates/llm/LICENSE | 21 ++++++++ crates/llm/README.md | 28 ++++++++++ crates/llm/src/argument.rs | 37 +++++++++++++ crates/llm/src/main.rs | 103 +++++++++++++++++++++++++++++++++++++ crates/mysql/src/lib.rs | 54 +++++++++++++++++-- 10 files changed, 269 insertions(+), 14 deletions(-) create mode 100644 crates/llm/Cargo.toml create mode 100644 crates/llm/LICENSE create mode 100644 crates/llm/README.md create mode 100644 crates/llm/src/argument.rs create mode 100644 crates/llm/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index d5a938d..e510071 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,5 +3,6 @@ resolver = "2" members = [ "crates/crawler", "crates/http", + "crates/llm", "crates/mysql", ] \ No newline at end of file diff --git a/README.md b/README.md index e9ba5b5..ef2f5d3 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Crawl content from RSS feeds into multiple formats ## Components -* [x] `rssto-crawler` - RSS feed reader and data scrapper daemon -* [x] `rssto-http` - Web server implementation based on the Rocket engine -* [x] `rssto-mysql` - Shared database library -* [ ] `rssto-llama` - Feeds auto-translation tool \ No newline at end of file +* `rssto-crawler` - RSS feed reader and data scrapper daemon +* `rssto-http` - Web server implementation based on the Rocket engine +* `rssto-llm` - Feeds auto-translation +* `rssto-mysql` - Shared database library \ No newline at end of file diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index eb57623..9156f0f 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -190,7 +190,7 @@ fn crawl(db: &Mysql, channel_config: &config::Channel) -> Result<()> { db.contents_by_channel_item_id_provider_id(channel_item_id, None, Some(1))? .is_empty() ); - let _content_id = db.insert_content(channel_item_id, None, title, description)?; + let _content_id = db.insert_content(channel_item_id, None, &title, &description)?; // @TODO preload media } Ok(()) diff --git a/crates/http/README.md b/crates/http/README.md index 3034cb8..c9edc0e 100644 --- a/crates/http/README.md +++ b/crates/http/README.md @@ -7,7 +7,8 @@ Web server implementation based on the Rocket engine ``` cd rssto/crates/rssto-http -cargo run -- --mysql-username USER \ - --mysql-password PASS \ - --mysql-database NAME -``` \ No newline at end of file +cargo run -- --mysql-username {USER} \ + --mysql-password {PASS} \ + --mysql-database {NAME} +``` +* optionally, use `--provider-id {ID}` to filter content using post-processing results (e.g. generated by the `rssto-llm` crate) \ No newline at end of file diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml new file mode 100644 index 0000000..7bc1b53 --- /dev/null +++ b/crates/llm/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "rssto-llm" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "LLM daemon for the rssto DB translations" +keywords = ["rss", "llm", "translation", "localization", "server"] +categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] +repository = "https://github.com/YGGverse/rssto" + +[dependencies] +anyhow = "1.0.100" +chrono = "0.4.42" +clap = { version = "4.5.54", features = ["derive"] } +lancor = "0.1.1" +log = "0.4.29" +mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } +tokio = { version = "1.0", features = ["full"] } +tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } \ No newline at end of file diff --git a/crates/llm/LICENSE b/crates/llm/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/llm/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/llm/README.md b/crates/llm/README.md new file mode 100644 index 0000000..901fb66 --- /dev/null +++ b/crates/llm/README.md @@ -0,0 +1,28 @@ +# rssto-llm + +LLM daemon for the rssto DB translations + +> [!NOTE] +> In development! + +1. Setup `rssto-crawler` first and collect initial data + +2. Run LLM server: + +``` +llama-server -hf ggml-org/gemma-3-1b-it-GGUF` +``` + +3. Launch `rssto-llm` to handle `content` DB: + +``` +cd rssto/crates/rssto-llm +cargo run -- --mysql-username {USER} \ + --mysql-password {PASS} \ + --mysql-database {NAME} \ + --llm-host {HOST} \ + --llm-port {PORT} \ + --llm-model {MODEL} \ + --llm-message {MESSAGE} +``` +* see `--help` to display all supported options \ No newline at end of file diff --git a/crates/llm/src/argument.rs b/crates/llm/src/argument.rs new file mode 100644 index 0000000..a3cc51d --- /dev/null +++ b/crates/llm/src/argument.rs @@ -0,0 +1,37 @@ +use clap::Parser; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +pub struct Argument { + // LLM + #[arg(long, default_value_t = String::from("http"))] + pub llm_scheme: String, + #[arg(long, default_value_t = String::from("localhost"))] + pub llm_host: String, + #[arg(long, default_value_t = 8080)] + pub llm_port: u16, + + /// Model name (e.g. `INSAIT-Institute/MamayLM-Gemma-2-9B-IT-v0.1-GGUF` or `ggml-org/gemma-3-1b-it-GGUF`) + #[arg(long)] + pub llm_model: String, + + /// Initial message for the `content` subject (e.g. `translate to...`) + #[arg(long)] + pub llm_message: String, + + // Database + #[arg(long, default_value_t = String::from("localhost"))] + pub mysql_host: String, + #[arg(long, default_value_t = 3306)] + pub mysql_port: u16, + #[arg(long)] + pub mysql_username: String, + #[arg(long)] + pub mysql_password: String, + #[arg(long)] + pub mysql_database: String, + /// Loop update in seconds + /// * None to exit on complete + #[arg(long, short)] + pub update: Option, +} diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs new file mode 100644 index 0000000..8510253 --- /dev/null +++ b/crates/llm/src/main.rs @@ -0,0 +1,103 @@ +mod argument; + +use anyhow::Result; + +#[tokio::main] +async fn main() -> Result<()> { + use argument::Argument; + use chrono::Local; + use clap::Parser; + use lancor::{ChatCompletionRequest, LlamaCppClient, Message}; + use log::{debug, info}; + use mysql::{Mysql, Sort}; + use std::env::var; + + if var("RUST_LOG").is_ok() { + use tracing_subscriber::{EnvFilter, fmt::*}; + struct T; + impl time::FormatTime for T { + fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result { + write!(w, "{}", Local::now()) + } + } + fmt() + .with_timer(T) + .with_env_filter(EnvFilter::from_default_env()) + .init() + } + + let arg = Argument::parse(); + let db = Mysql::connect( + &arg.mysql_host, + arg.mysql_port, + &arg.mysql_username, + &arg.mysql_password, + &arg.mysql_database, + )?; + let llm = LlamaCppClient::new(format!( + "{}://{}:{}", + arg.llm_scheme, arg.llm_host, arg.llm_port + ))?; + + let provider_id = match db.provider_by_name(&arg.llm_model)? { + Some(p) => { + debug!( + "Use existing DB provider #{} matches model name `{}`", + p.provider_id, &arg.llm_model + ); + p.provider_id + } + None => { + let provider_id = db.insert_provider(&arg.llm_model)?; + info!( + "Provider `{}` not found in database, created new one with ID `{provider_id}`", + &arg.llm_model + ); + provider_id + } + }; + + info!("Daemon started"); + loop { + debug!("New queue begin..."); + for source in db.contents_queue_for_provider_id(provider_id, Sort::Asc, None)? { + debug!( + "Begin generating `content_id` #{} using `provider_id` #{provider_id}.", + source.content_id + ); + + let title = + llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( + Message::user(format!("{}\n{}", arg.llm_message, source.title)), + )) + .await?; + + println!("{}", &title.choices[0].message.content); + + let description = + llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( + Message::user(format!("{}\n{}", arg.llm_message, source.description)), + )) + .await?; + + let content_id = db.insert_content( + source.channel_item_id, + Some(provider_id), + &title.choices[0].message.content, + &description.choices[0].message.content, + )?; + + debug!( + "Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", + source.content_id + ) + } + debug!("Queue completed"); + if let Some(update) = arg.update { + debug!("Wait {update} seconds to continue..."); + std::thread::sleep(std::time::Duration::from_secs(update)) + } else { + return Ok(()); + } + } +} diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index e601545..f8c6c48 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -108,7 +108,7 @@ impl Mysql { pub fn contents_total_by_provider_id(&self, provider_id: Option) -> Result { let total: Option = self.pool.get_conn()?.exec_first( - "SELECT COUNT(*) FROM `content` WHERE `provider_id` = ?", + "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ?", (provider_id,), )?; Ok(total.unwrap_or(0)) @@ -125,12 +125,35 @@ impl Mysql { `channel_item_id`, `provider_id`, `title`, - `description` FROM `content` WHERE `provider_id` = ? ORDER BY `content_id` {sort} LIMIT {}", + `description` FROM `content` WHERE `provider_id` <=> ? ORDER BY `content_id` {sort} LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), (provider_id, )) } + /// Get subjects for `rssto-llm` queue + pub fn contents_queue_for_provider_id( + &self, + provider_id: u64, + sort: Sort, + limit: Option, + ) -> Result, Error> { + self.pool.get_conn()?.exec( + format!( + "SELECT `c1`.`content_id`, + `c1`.`channel_item_id`, + `c1`.`provider_id`, + `c1`.`title`, + `c1`.`description` + FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS ( + SELECT NULL FROM `content` AS `c2` WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id` AND `c2`.`provider_id` = ? LIMIT 1 + ) ORDER BY `c1`.`content_id` {sort} LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id,), + ) + } + pub fn contents_by_channel_item_id_provider_id( &self, channel_item_id: u64, @@ -143,7 +166,7 @@ impl Mysql { `channel_item_id`, `provider_id`, `title`, - `description` FROM `content` WHERE `channel_item_id` = ? AND `provider_id` = ? LIMIT {}", + `description` FROM `content` WHERE `channel_item_id` = ? AND `provider_id` <=> ? LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), (channel_item_id, provider_id), @@ -154,8 +177,8 @@ impl Mysql { &self, channel_item_id: u64, provider_id: Option, - title: String, - description: String, + title: &str, + description: &str, ) -> Result { let mut c = self.pool.get_conn()?; c.exec_drop( @@ -164,6 +187,21 @@ impl Mysql { )?; Ok(c.last_insert_id()) } + + pub fn provider_by_name(&self, name: &str) -> Result, Error> { + self.pool.get_conn()?.exec_first( + "SELECT `provider_id`, + `name` + FROM `provider` WHERE `name` = ?", + (name,), + ) + } + + pub fn insert_provider(&self, name: &str) -> Result { + let mut c = self.pool.get_conn()?; + c.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; + Ok(c.last_insert_id()) + } } #[derive(Debug, PartialEq, Eq, FromRow)] @@ -194,6 +232,12 @@ pub struct Content { pub description: String, } +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Provider { + pub provider_id: u64, + pub name: String, +} + pub enum Sort { Asc, Desc, From 013990c9f45bfb756789ac2f4a5cd799805cbdb1 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 14:05:39 +0200 Subject: [PATCH 43/65] remove debug row --- crates/llm/src/main.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs index 8510253..4caf4e4 100644 --- a/crates/llm/src/main.rs +++ b/crates/llm/src/main.rs @@ -72,8 +72,6 @@ async fn main() -> Result<()> { )) .await?; - println!("{}", &title.choices[0].message.content); - let description = llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( Message::user(format!("{}\n{}", arg.llm_message, source.description)), From 54ed430eddb4acd96564644ff18369e3b3dabc00 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 14:15:32 +0200 Subject: [PATCH 44/65] remove extra quote --- crates/llm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/llm/README.md b/crates/llm/README.md index 901fb66..ecb64a8 100644 --- a/crates/llm/README.md +++ b/crates/llm/README.md @@ -10,7 +10,7 @@ LLM daemon for the rssto DB translations 2. Run LLM server: ``` -llama-server -hf ggml-org/gemma-3-1b-it-GGUF` +llama-server -hf ggml-org/gemma-3-1b-it-GGUF ``` 3. Launch `rssto-llm` to handle `content` DB: From 62b8711ce9fb7f97b75fa920082ffc8c5f782b8d Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 15:14:53 +0200 Subject: [PATCH 45/65] add readme --- crates/mysql/README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 crates/mysql/README.md diff --git a/crates/mysql/README.md b/crates/mysql/README.md new file mode 100644 index 0000000..681e048 --- /dev/null +++ b/crates/mysql/README.md @@ -0,0 +1,3 @@ +# rssto-mysql + +Shared MySQL database library From a8f6c4588721a00da96ade49f4fd5a0f88f6a45a Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 16:48:14 +0200 Subject: [PATCH 46/65] remove short option conflict --- crates/http/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs index 56c3f41..9ca317f 100644 --- a/crates/http/src/config.rs +++ b/crates/http/src/config.rs @@ -32,7 +32,7 @@ pub struct Config { pub host: IpAddr, /// Bind server on given port - #[arg(long, short, default_value_t = 8000)] + #[arg(long, default_value_t = 8000)] pub port: u16, /// Configure instance in the debug mode From a0ba99274697f9704966d9664a00d58b8b84248e Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 16:48:38 +0200 Subject: [PATCH 47/65] add adaptive css header --- crates/http/templates/layout.html.tera | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/http/templates/layout.html.tera b/crates/http/templates/layout.html.tera index a722e34..d10ca68 100644 --- a/crates/http/templates/layout.html.tera +++ b/crates/http/templates/layout.html.tera @@ -6,6 +6,9 @@ {% if meta.description %} {% endif %} +
From b5dd30dafbe1c41572e8b5f272ba3ea62ccf0365 Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 8 Jan 2026 17:10:56 +0200 Subject: [PATCH 48/65] implement search keyword handler --- crates/http/src/main.rs | 9 +++++---- crates/mysql/src/lib.rs | 20 +++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 1e3302d..0c04491 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -33,7 +33,7 @@ fn index( title: String, } let total = db - .contents_total_by_provider_id(global.provider_id) + .contents_total_by_provider_id(global.provider_id, search) .map_err(|e| { error!("Could not get contents total: `{e}`"); Status::InternalServerError @@ -65,7 +65,7 @@ fn index( back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), next: if page.unwrap_or(1) * global.list_limit >= total { None } else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, - rows: db.contents_by_provider_id(global.provider_id, Sort::Desc, Some(global.list_limit)).map_err(|e| { + rows: db.contents_by_provider_id(global.provider_id, search, Sort::Desc, Some(global.list_limit)).map_err(|e| { error!("Could not get contents: `{e}`"); Status::InternalServerError })? @@ -118,8 +118,9 @@ fn info( } } -#[get("/rss")] +#[get("/rss?")] fn rss( + search: Option<&str>, global: &State, meta: &State, db: &State, @@ -130,7 +131,7 @@ fn rss( 1024, // @TODO ); for c in db - .contents_by_provider_id(global.provider_id, Sort::Desc, Some(20)) // @TODO + .contents_by_provider_id(global.provider_id, search, Sort::Desc, Some(20)) // @TODO .map_err(|e| { error!("Could not load channel item contents: `{e}`"); Status::InternalServerError diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index f8c6c48..f3e168f 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -106,10 +106,14 @@ impl Mysql { ) } - pub fn contents_total_by_provider_id(&self, provider_id: Option) -> Result { + pub fn contents_total_by_provider_id( + &self, + provider_id: Option, + keyword: Option<&str>, + ) -> Result { let total: Option = self.pool.get_conn()?.exec_first( - "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ?", - (provider_id,), + "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?", + (provider_id, like(keyword)), )?; Ok(total.unwrap_or(0)) } @@ -117,6 +121,7 @@ impl Mysql { pub fn contents_by_provider_id( &self, provider_id: Option, + keyword: Option<&str>, sort: Sort, limit: Option, ) -> Result, Error> { @@ -125,10 +130,10 @@ impl Mysql { `channel_item_id`, `provider_id`, `title`, - `description` FROM `content` WHERE `provider_id` <=> ? ORDER BY `content_id` {sort} LIMIT {}", + `description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), - (provider_id, )) + (provider_id, like(keyword), )) } /// Get subjects for `rssto-llm` queue @@ -252,4 +257,9 @@ impl std::fmt::Display for Sort { } } +/// Shared search logic +fn like(value: Option<&str>) -> String { + value.map_or("%".into(), |k| format!("{k}%")) +} + const DEFAULT_LIMIT: usize = 100; From 221b43e4cf71985b918d112bfa71c204fd63d71e Mon Sep 17 00:00:00 2001 From: yggverse Date: Fri, 9 Jan 2026 18:27:27 +0200 Subject: [PATCH 49/65] implement image persistence db features, minor corrections --- crates/mysql/database/0.1.0.sql | 39 ++++++++++++++++- crates/mysql/src/lib.rs | 78 ++++++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 7 deletions(-) diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index 5eb70ff..6c318f2 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- Thu 08 Jan 2026 12:40:45 AM EET +-- пт, 09-січ-2026 17:57:03 +0200 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -65,7 +65,7 @@ ENGINE = InnoDB; -- Table `rssto`.`content` -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`content` ( - `content_id` BIGINT NOT NULL AUTO_INCREMENT, + `content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `channel_item_id` INT NOT NULL, `provider_id` INT NULL, `title` VARCHAR(255) NOT NULL, @@ -87,6 +87,41 @@ CREATE TABLE IF NOT EXISTS `rssto`.`content` ( ENGINE = InnoDB; +-- ----------------------------------------------------- +-- Table `rssto`.`image` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`image` ( + `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `source` VARCHAR(2048) NOT NULL, + `data` MEDIUMBLOB NOT NULL, + PRIMARY KEY (`image_id`), + UNIQUE INDEX `source_UNIQUE` (`source` ASC) VISIBLE) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`content_image` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`content_image` ( + `content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `content_id` BIGINT UNSIGNED NOT NULL, + `image_id` BIGINT UNSIGNED NOT NULL, + PRIMARY KEY (`content_image_id`), + INDEX `fk_content_image_content_idx` (`content_id` ASC) VISIBLE, + INDEX `fk_content_image_image_idx` (`image_id` ASC) VISIBLE, + CONSTRAINT `fk_content_image_content` + FOREIGN KEY (`content_id`) + REFERENCES `rssto`.`content` (`content_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_content_image_image` + FOREIGN KEY (`image_id`) + REFERENCES `rssto`.`image` (`image_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + SET SQL_MODE=@OLD_SQL_MODE; SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index f3e168f..a0cbdb4 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -23,13 +23,12 @@ impl Mysql { } pub fn channels_by_url(&self, url: &str, limit: Option) -> Result, Error> { - self.pool.get_conn()?.exec_map( + self.pool.get_conn()?.exec( format!( "SELECT `channel_id`, `url` FROM `channel` WHERE `url` = ? LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), (url,), - |(channel_id, url)| Channel { channel_id, url }, ) } @@ -171,7 +170,8 @@ impl Mysql { `channel_item_id`, `provider_id`, `title`, - `description` FROM `content` WHERE `channel_item_id` = ? AND `provider_id` <=> ? LIMIT {}", + `description` FROM `content` + WHERE `channel_item_id` = ? AND `provider_id` <=> ? LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) ), (channel_item_id, provider_id), @@ -187,8 +187,58 @@ impl Mysql { ) -> Result { let mut c = self.pool.get_conn()?; c.exec_drop( - "INSERT INTO `content` SET `channel_item_id` = ?, `provider_id` = ?, `title` = ?, `description` = ?", - (channel_item_id, provider_id, title, description ), + "INSERT INTO `content` SET `channel_item_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_item_id, provider_id, title, description), + )?; + Ok(c.last_insert_id()) + } + + pub fn content_image(&self, content_image_id: u64) -> Result, Error> { + self.pool.get_conn()?.exec_first( + "SELECT `content_image_id`, + `content_id`, + `image_id`, + `data`, + `source` FROM `content_image` + JOIN `image` ON (`image`.`image_id` = `content_image`.`image_id`) + WHERE `content_image_id` = ? LIMIT 1", + (content_image_id,), + ) + } + + pub fn insert_content_image(&self, content_id: u64, image_id: u64) -> Result { + let mut c = self.pool.get_conn()?; + c.exec_drop( + "INSERT INTO `content_image` SET `content_id` = ?, `image_id` = ?", + (content_id, image_id), + )?; + Ok(c.last_insert_id()) + } + + pub fn image_by_source(&self, source: &str) -> Result, Error> { + self.pool.get_conn()?.exec_first( + "SELECT `image_id`, + `source`, + `data` FROM `image` WHERE `source` = ? LIMIT 1", + (source,), + ) + } + + pub fn images(&self, limit: Option) -> Result, Error> { + self.pool.get_conn()?.query(format!( + "SELECT `image_id`, `source`, `data` FROM `image` LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + )) + } + + pub fn insert_image(&self, source: &str, data: &[u8]) -> Result { + let mut c = self.pool.get_conn()?; + c.exec_drop( + "INSERT INTO `image` SET `source` = ?, `data` = ?", + (source, data), )?; Ok(c.last_insert_id()) } @@ -243,6 +293,24 @@ pub struct Provider { pub name: String, } +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Image { + pub image_id: u64, + pub source: String, + pub data: Vec, +} + +/// Includes joined `image` table members +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ContentImage { + pub content_image_id: u64, + pub content_id: u64, + pub image_id: u64, + // Image members (JOIN) + pub data: Vec, + pub source: String, +} + pub enum Sort { Asc, Desc, From f48e256fadd82cdf7a643bf4bf1f30feba9e33a0 Mon Sep 17 00:00:00 2001 From: yggverse Date: Fri, 9 Jan 2026 22:35:06 +0200 Subject: [PATCH 50/65] separate Pollable and Transactional features, separate table members, use single-connection transactions method in the crawler and llm crates, minor crawler optimizations such as disconnect from db server on each queue iteration complete --- crates/crawler/Cargo.toml | 2 +- crates/crawler/src/main.rs | 44 ++-- crates/http/src/main.rs | 10 +- crates/llm/Cargo.toml | 2 +- crates/llm/src/main.rs | 116 +++++----- crates/mysql/Cargo.toml | 7 +- crates/mysql/src/lib.rs | 338 +----------------------------- crates/mysql/src/pollable.rs | 114 ++++++++++ crates/mysql/src/pollable/sort.rs | 13 ++ crates/mysql/src/table.rs | 53 +++++ crates/mysql/src/transactional.rs | 148 +++++++++++++ 11 files changed, 438 insertions(+), 409 deletions(-) create mode 100644 crates/mysql/src/pollable.rs create mode 100644 crates/mysql/src/pollable/sort.rs create mode 100644 crates/mysql/src/table.rs create mode 100644 crates/mysql/src/transactional.rs diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml index 11d6062..1de8c3d 100644 --- a/crates/crawler/Cargo.toml +++ b/crates/crawler/Cargo.toml @@ -14,7 +14,7 @@ anyhow = "1.0.100" chrono = "0.4.42" clap = { version = "4.5.54", features = ["derive"] } log = "0.4.29" -mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } +mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transactional"], path = "../mysql" } reqwest = { version = "0.13.1", features = ["blocking"] } rss = "2.0.12" scraper = { version = "0.25.0", features = ["serde"] } diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 9156f0f..25c8279 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -3,7 +3,7 @@ mod config; use anyhow::Result; use log::{debug, info, warn}; -use mysql::Mysql; +use mysql::Transactional; use reqwest::blocking::get; fn main() -> Result<()> { @@ -28,22 +28,27 @@ fn main() -> Result<()> { let argument = Argument::parse(); let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?; - let database = Mysql::connect( - &config.mysql.host, - config.mysql.port, - &config.mysql.user, - &config.mysql.password, - &config.mysql.database, - )?; info!("Crawler started"); loop { debug!("Begin new crawl queue..."); - for c in &config.channel { - debug!("Update `{}`...", c.url); - if let Err(e) = crawl(&database, c) { - warn!("Channel `{}` update failed: `{e}`", c.url) + { + // disconnect from the database immediately when exiting this scope, + // in case the `update` queue is enabled and pending for a while. + let mut db = Transactional::connect( + &config.mysql.host, + config.mysql.port, + &config.mysql.user, + &config.mysql.password, + &config.mysql.database, + )?; + for c in &config.channel { + debug!("Update `{}`...", c.url); + if let Err(e) = crawl(&mut db, c) { + warn!("Channel `{}` update failed: `{e}`", c.url) + } } + db.commit()? } debug!("Crawl queue completed"); if let Some(update) = config.update { @@ -55,7 +60,7 @@ fn main() -> Result<()> { } } -fn crawl(db: &Mysql, channel_config: &config::Channel) -> Result<()> { +fn crawl(db: &mut Transactional, channel_config: &config::Channel) -> Result<()> { use rss::Channel; use scraper::Selector; @@ -82,8 +87,8 @@ fn crawl(db: &Mysql, channel_config: &config::Channel) -> Result<()> { let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); - let channel_id = match db.channels_by_url(&channel_url, Some(1))?.first() { - Some(result) => result.channel_id, + let channel_id = match db.channel_id_by_url(&channel_url)? { + Some(channel_id) => channel_id, None => db.insert_channel(&channel_url)?, }; @@ -115,10 +120,7 @@ fn crawl(db: &Mysql, channel_config: &config::Channel) -> Result<()> { continue; } }; - if !db - .channel_items_by_channel_id_guid(channel_id, guid, Some(1))? - .is_empty() - { + if db.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { continue; // skip next steps as processed } let channel_item_id = db.insert_channel_item( @@ -186,10 +188,6 @@ fn crawl(db: &Mysql, channel_config: &config::Channel) -> Result<()> { } }, }; - assert!( - db.contents_by_channel_item_id_provider_id(channel_item_id, None, Some(1))? - .is_empty() - ); let _content_id = db.insert_content(channel_item_id, None, &title, &description)?; // @TODO preload media } diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 0c04491..3964aea 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -11,7 +11,7 @@ use config::Config; use feed::Feed; use global::Global; use meta::Meta; -use mysql::{Mysql, Sort}; +use mysql::{Pollable, pollable::Sort}; use rocket::{State, http::Status, response::content::RawXml, serde::Serialize}; use rocket_dyn_templates::{Template, context}; @@ -19,7 +19,7 @@ use rocket_dyn_templates::{Template, context}; fn index( search: Option<&str>, page: Option, - db: &State, + db: &State, meta: &State, global: &State, ) -> Result { @@ -92,7 +92,7 @@ fn index( #[get("/")] fn info( content_id: u64, - db: &State, + db: &State, meta: &State, global: &State, ) -> Result { @@ -123,7 +123,7 @@ fn rss( search: Option<&str>, global: &State, meta: &State, - db: &State, + db: &State, ) -> Result, Status> { let mut f = Feed::new( &meta.title, @@ -165,7 +165,7 @@ fn rocket() -> _ { } }) .manage( - Mysql::connect( + Pollable::connect( &config.mysql_host, config.mysql_port, &config.mysql_username, diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index 7bc1b53..a5fa968 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -15,6 +15,6 @@ chrono = "0.4.42" clap = { version = "4.5.54", features = ["derive"] } lancor = "0.1.1" log = "0.4.29" -mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } +mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transactional"], path = "../mysql" } tokio = { version = "1.0", features = ["full"] } tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } \ No newline at end of file diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs index 4caf4e4..dbcfd59 100644 --- a/crates/llm/src/main.rs +++ b/crates/llm/src/main.rs @@ -1,15 +1,16 @@ mod argument; use anyhow::Result; +use argument::Argument; +use mysql::Transactional; #[tokio::main] async fn main() -> Result<()> { - use argument::Argument; use chrono::Local; use clap::Parser; use lancor::{ChatCompletionRequest, LlamaCppClient, Message}; use log::{debug, info}; - use mysql::{Mysql, Sort}; + use std::env::var; if var("RUST_LOG").is_ok() { @@ -27,68 +28,73 @@ async fn main() -> Result<()> { } let arg = Argument::parse(); - let db = Mysql::connect( - &arg.mysql_host, - arg.mysql_port, - &arg.mysql_username, - &arg.mysql_password, - &arg.mysql_database, - )?; let llm = LlamaCppClient::new(format!( "{}://{}:{}", arg.llm_scheme, arg.llm_host, arg.llm_port ))?; - let provider_id = match db.provider_by_name(&arg.llm_model)? { - Some(p) => { - debug!( - "Use existing DB provider #{} matches model name `{}`", - p.provider_id, &arg.llm_model - ); - p.provider_id - } - None => { - let provider_id = db.insert_provider(&arg.llm_model)?; - info!( - "Provider `{}` not found in database, created new one with ID `{provider_id}`", - &arg.llm_model - ); - provider_id + // find existing ID by model name or create a new one + // * this feature should be moved to a separate CLI tool + let provider_id = { + let mut db = tx(&arg)?; + match db.provider_id_by_name(&arg.llm_model)? { + Some(provider_id) => { + debug!( + "Use existing DB provider #{} matches model name `{}`", + provider_id, &arg.llm_model + ); + provider_id + } + None => { + let provider_id = db.insert_provider(&arg.llm_model)?; + info!( + "Provider `{}` not found in database, created new one with ID `{provider_id}`", + &arg.llm_model + ); + db.commit()?; + provider_id + } } }; info!("Daemon started"); loop { debug!("New queue begin..."); - for source in db.contents_queue_for_provider_id(provider_id, Sort::Asc, None)? { - debug!( - "Begin generating `content_id` #{} using `provider_id` #{provider_id}.", - source.content_id - ); + { + // disconnect from the database immediately when exiting this scope, + // in case the `update` queue is enabled and pending for a while. + let mut db = tx(&arg)?; + for source in db.contents_queue_for_provider_id(provider_id)? { + debug!( + "Begin generating `content_id` #{} using `provider_id` #{provider_id}.", + source.content_id + ); - let title = - llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( - Message::user(format!("{}\n{}", arg.llm_message, source.title)), - )) - .await?; + let title = llm + .chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( + Message::user(format!("{}\n{}", arg.llm_message, source.title)), + )) + .await?; - let description = - llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( - Message::user(format!("{}\n{}", arg.llm_message, source.description)), - )) - .await?; + let description = llm + .chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( + Message::user(format!("{}\n{}", arg.llm_message, source.description)), + )) + .await?; - let content_id = db.insert_content( - source.channel_item_id, - Some(provider_id), - &title.choices[0].message.content, - &description.choices[0].message.content, - )?; + let content_id = db.insert_content( + source.channel_item_id, + Some(provider_id), + &title.choices[0].message.content, + &description.choices[0].message.content, + )?; - debug!( - "Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", - source.content_id - ) + debug!( + "Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", + source.content_id + ) + } + db.commit()? } debug!("Queue completed"); if let Some(update) = arg.update { @@ -99,3 +105,15 @@ async fn main() -> Result<()> { } } } + +// in fact, there is no need for a transaction at this moment, +// as there are no related table updates, but who knows what the future holds +fn tx(arg: &Argument) -> Result { + Ok(Transactional::connect( + &arg.mysql_host, + arg.mysql_port, + &arg.mysql_username, + &arg.mysql_password, + &arg.mysql_database, + )?) +} diff --git a/crates/mysql/Cargo.toml b/crates/mysql/Cargo.toml index ddce0cc..7aeb4af 100644 --- a/crates/mysql/Cargo.toml +++ b/crates/mysql/Cargo.toml @@ -9,5 +9,10 @@ keywords = ["rssto", "database", "mysql", "library", "driver", "api"] # categories = [] repository = "https://github.com/YGGverse/rssto" +[features] +default = ["pollable"] +pollable = [] +transactional = [] + [dependencies] -mysql = "26.0.1" +mysql = "26.0.1" \ No newline at end of file diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index a0cbdb4..c316798 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -1,333 +1,13 @@ -use mysql::{ - Error, Pool, - prelude::{FromRow, Queryable}, -}; +#[cfg(feature = "pollable")] +pub mod pollable; -pub struct Mysql { - pool: Pool, -} +pub mod table; -impl Mysql { - pub fn connect( - host: &str, - port: u16, - user: &str, - password: &str, - database: &str, - ) -> Result { - Ok(Self { - pool: mysql::Pool::new( - format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(), - )?, - }) - } +#[cfg(feature = "transactional")] +pub mod transactional; - pub fn channels_by_url(&self, url: &str, limit: Option) -> Result, Error> { - self.pool.get_conn()?.exec( - format!( - "SELECT `channel_id`, `url` FROM `channel` WHERE `url` = ? LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - ), - (url,), - ) - } +#[cfg(feature = "pollable")] +pub use pollable::Pollable; - pub fn insert_channel(&self, url: &str) -> Result { - let mut c = self.pool.get_conn()?; - c.exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?; - Ok(c.last_insert_id()) - } - - pub fn channel_item(&self, channel_item_id: u64) -> Result, Error> { - self.pool.get_conn()?.exec_first( - "SELECT `channel_item_id`, - `channel_id`, - `pub_date`, - `guid`, - `link`, - `title`, - `description` FROM `channel_item` WHERE `channel_item_id` = ?", - (channel_item_id,), - ) - } - - pub fn channel_items_by_channel_id_guid( - &self, - channel_id: u64, - guid: &str, - limit: Option, - ) -> Result, Error> { - self.pool.get_conn()?.exec( - format!( - "SELECT `channel_item_id`, - `channel_id`, - `pub_date`, - `guid`, - `link`, - `title`, - `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - ), - (channel_id, guid), - ) - } - - pub fn insert_channel_item( - &self, - channel_id: u64, - pub_date: i64, - guid: &str, - link: &str, - title: Option<&str>, - description: Option<&str>, - ) -> Result { - let mut c = self.pool.get_conn()?; - c.exec_drop( - "INSERT INTO `channel_item` SET `channel_id` = ?, - `pub_date` = ?, - `guid` = ?, - `link` = ?, - `title` = ?, - `description` = ?", - (channel_id, pub_date, guid, link, title, description), - )?; - Ok(c.last_insert_id()) - } - - pub fn content(&self, content_id: u64) -> Result, Error> { - self.pool.get_conn()?.exec_first( - "SELECT `content_id`, - `channel_item_id`, - `provider_id`, - `title`, - `description` FROM `content` WHERE `content_id` = ?", - (content_id,), - ) - } - - pub fn contents_total_by_provider_id( - &self, - provider_id: Option, - keyword: Option<&str>, - ) -> Result { - let total: Option = self.pool.get_conn()?.exec_first( - "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?", - (provider_id, like(keyword)), - )?; - Ok(total.unwrap_or(0)) - } - - pub fn contents_by_provider_id( - &self, - provider_id: Option, - keyword: Option<&str>, - sort: Sort, - limit: Option, - ) -> Result, Error> { - self.pool.get_conn()?.exec(format!( - "SELECT `content_id`, - `channel_item_id`, - `provider_id`, - `title`, - `description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - ), - (provider_id, like(keyword), )) - } - - /// Get subjects for `rssto-llm` queue - pub fn contents_queue_for_provider_id( - &self, - provider_id: u64, - sort: Sort, - limit: Option, - ) -> Result, Error> { - self.pool.get_conn()?.exec( - format!( - "SELECT `c1`.`content_id`, - `c1`.`channel_item_id`, - `c1`.`provider_id`, - `c1`.`title`, - `c1`.`description` - FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS ( - SELECT NULL FROM `content` AS `c2` WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id` AND `c2`.`provider_id` = ? LIMIT 1 - ) ORDER BY `c1`.`content_id` {sort} LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - ), - (provider_id,), - ) - } - - pub fn contents_by_channel_item_id_provider_id( - &self, - channel_item_id: u64, - provider_id: Option, - limit: Option, - ) -> Result, Error> { - self.pool.get_conn()?.exec( - format!( - "SELECT `content_id`, - `channel_item_id`, - `provider_id`, - `title`, - `description` FROM `content` - WHERE `channel_item_id` = ? AND `provider_id` <=> ? LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - ), - (channel_item_id, provider_id), - ) - } - - pub fn insert_content( - &self, - channel_item_id: u64, - provider_id: Option, - title: &str, - description: &str, - ) -> Result { - let mut c = self.pool.get_conn()?; - c.exec_drop( - "INSERT INTO `content` SET `channel_item_id` = ?, - `provider_id` = ?, - `title` = ?, - `description` = ?", - (channel_item_id, provider_id, title, description), - )?; - Ok(c.last_insert_id()) - } - - pub fn content_image(&self, content_image_id: u64) -> Result, Error> { - self.pool.get_conn()?.exec_first( - "SELECT `content_image_id`, - `content_id`, - `image_id`, - `data`, - `source` FROM `content_image` - JOIN `image` ON (`image`.`image_id` = `content_image`.`image_id`) - WHERE `content_image_id` = ? LIMIT 1", - (content_image_id,), - ) - } - - pub fn insert_content_image(&self, content_id: u64, image_id: u64) -> Result { - let mut c = self.pool.get_conn()?; - c.exec_drop( - "INSERT INTO `content_image` SET `content_id` = ?, `image_id` = ?", - (content_id, image_id), - )?; - Ok(c.last_insert_id()) - } - - pub fn image_by_source(&self, source: &str) -> Result, Error> { - self.pool.get_conn()?.exec_first( - "SELECT `image_id`, - `source`, - `data` FROM `image` WHERE `source` = ? LIMIT 1", - (source,), - ) - } - - pub fn images(&self, limit: Option) -> Result, Error> { - self.pool.get_conn()?.query(format!( - "SELECT `image_id`, `source`, `data` FROM `image` LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - )) - } - - pub fn insert_image(&self, source: &str, data: &[u8]) -> Result { - let mut c = self.pool.get_conn()?; - c.exec_drop( - "INSERT INTO `image` SET `source` = ?, `data` = ?", - (source, data), - )?; - Ok(c.last_insert_id()) - } - - pub fn provider_by_name(&self, name: &str) -> Result, Error> { - self.pool.get_conn()?.exec_first( - "SELECT `provider_id`, - `name` - FROM `provider` WHERE `name` = ?", - (name,), - ) - } - - pub fn insert_provider(&self, name: &str) -> Result { - let mut c = self.pool.get_conn()?; - c.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; - Ok(c.last_insert_id()) - } -} - -#[derive(Debug, PartialEq, Eq, FromRow)] -pub struct Channel { - pub channel_id: u64, - pub url: String, -} - -#[derive(Debug, PartialEq, Eq, FromRow)] -pub struct ChannelItem { - pub channel_item_id: u64, - pub channel_id: u64, - pub pub_date: i64, - pub guid: String, - pub link: String, - pub title: Option, - pub description: Option, -} - -#[derive(Debug, PartialEq, Eq, FromRow)] -pub struct Content { - pub content_id: u64, - pub channel_item_id: u64, - /// None if the original `title` and `description` values - /// parsed from the channel item on crawl - pub provider_id: Option, - pub title: String, - pub description: String, -} - -#[derive(Debug, PartialEq, Eq, FromRow)] -pub struct Provider { - pub provider_id: u64, - pub name: String, -} - -#[derive(Debug, PartialEq, Eq, FromRow)] -pub struct Image { - pub image_id: u64, - pub source: String, - pub data: Vec, -} - -/// Includes joined `image` table members -#[derive(Debug, PartialEq, Eq, FromRow)] -pub struct ContentImage { - pub content_image_id: u64, - pub content_id: u64, - pub image_id: u64, - // Image members (JOIN) - pub data: Vec, - pub source: String, -} - -pub enum Sort { - Asc, - Desc, -} - -impl std::fmt::Display for Sort { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match self { - Self::Asc => write!(f, "ASC"), - Self::Desc => write!(f, "DESC"), - } - } -} - -/// Shared search logic -fn like(value: Option<&str>) -> String { - value.map_or("%".into(), |k| format!("{k}%")) -} - -const DEFAULT_LIMIT: usize = 100; +#[cfg(feature = "transactional")] +pub use transactional::Transactional; diff --git a/crates/mysql/src/pollable.rs b/crates/mysql/src/pollable.rs new file mode 100644 index 0000000..474c427 --- /dev/null +++ b/crates/mysql/src/pollable.rs @@ -0,0 +1,114 @@ +pub mod sort; + +pub use sort::Sort; + +use crate::table::*; +use mysql::{Error, Pool, prelude::Queryable}; + +/// Safe, read-only operations used in client apps like `rssto-http` +pub struct Pollable { + pool: Pool, +} + +impl Pollable { + pub fn connect( + host: &str, + port: u16, + user: &str, + password: &str, + database: &str, + ) -> Result { + Ok(Self { + pool: mysql::Pool::new( + format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(), + )?, + }) + } + + pub fn channel_item(&self, channel_item_id: u64) -> Result, Error> { + self.pool.get_conn()?.exec_first( + "SELECT `channel_item_id`, + `channel_id`, + `pub_date`, + `guid`, + `link`, + `title`, + `description` FROM `channel_item` WHERE `channel_item_id` = ?", + (channel_item_id,), + ) + } + + pub fn content(&self, content_id: u64) -> Result, Error> { + self.pool.get_conn()?.exec_first( + "SELECT `content_id`, + `channel_item_id`, + `provider_id`, + `title`, + `description` FROM `content` WHERE `content_id` = ?", + (content_id,), + ) + } + + pub fn contents_total_by_provider_id( + &self, + provider_id: Option, + keyword: Option<&str>, + ) -> Result { + let total: Option = self.pool.get_conn()?.exec_first( + "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?", + (provider_id, like(keyword)), + )?; + Ok(total.unwrap_or(0)) + } + + pub fn contents_by_provider_id( + &self, + provider_id: Option, + keyword: Option<&str>, + sort: Sort, + limit: Option, + ) -> Result, Error> { + self.pool.get_conn()?.exec(format!( + "SELECT `content_id`, + `channel_item_id`, + `provider_id`, + `title`, + `description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id, like(keyword), )) + } + + pub fn content_image(&self, content_image_id: u64) -> Result, Error> { + self.pool.get_conn()?.exec_first( + "SELECT `content_image_id`, + `content_id`, + `image_id`, + `data`, + `source` FROM `content_image` + JOIN `image` ON (`image`.`image_id` = `content_image`.`image_id`) + WHERE `content_image_id` = ? LIMIT 1", + (content_image_id,), + ) + } + + pub fn images(&self, limit: Option) -> Result, Error> { + self.pool.get_conn()?.query(format!( + "SELECT `image_id`, `source`, `data` FROM `image` LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + )) + } + + pub fn insert_provider(&self, name: &str) -> Result { + let mut c = self.pool.get_conn()?; + c.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; + Ok(c.last_insert_id()) + } +} + +/// Shared search logic +fn like(value: Option<&str>) -> String { + value.map_or("%".into(), |k| format!("{k}%")) +} + +const DEFAULT_LIMIT: usize = 100; diff --git a/crates/mysql/src/pollable/sort.rs b/crates/mysql/src/pollable/sort.rs new file mode 100644 index 0000000..d8b121d --- /dev/null +++ b/crates/mysql/src/pollable/sort.rs @@ -0,0 +1,13 @@ +pub enum Sort { + Asc, + Desc, +} + +impl std::fmt::Display for Sort { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::Asc => write!(f, "ASC"), + Self::Desc => write!(f, "DESC"), + } + } +} diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs new file mode 100644 index 0000000..5df3348 --- /dev/null +++ b/crates/mysql/src/table.rs @@ -0,0 +1,53 @@ +use mysql::prelude::FromRow; + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Channel { + pub channel_id: u64, + pub url: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItem { + pub channel_item_id: u64, + pub channel_id: u64, + pub pub_date: i64, + pub guid: String, + pub link: String, + pub title: Option, + pub description: Option, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Content { + pub content_id: u64, + pub channel_item_id: u64, + /// None if the original `title` and `description` values + /// parsed from the channel item on crawl + pub provider_id: Option, + pub title: String, + pub description: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Provider { + pub provider_id: u64, + pub name: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Image { + pub image_id: u64, + pub source: String, + pub data: Vec, +} + +/// Includes joined `image` table members +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ContentImage { + pub content_image_id: u64, + pub content_id: u64, + pub image_id: u64, + // Image members (JOIN) + pub data: Vec, + pub source: String, +} diff --git a/crates/mysql/src/transactional.rs b/crates/mysql/src/transactional.rs new file mode 100644 index 0000000..ce80305 --- /dev/null +++ b/crates/mysql/src/transactional.rs @@ -0,0 +1,148 @@ +use crate::table::*; +use mysql::{Error, Pool, Transaction, TxOpts, prelude::Queryable}; + +/// Safe, optimized read/write operations +/// mostly required by the `rssto-crawler` and `rssto-llm` +/// * all members implementation requires `commit` action +pub struct Transactional { + tx: Transaction<'static>, +} + +impl Transactional { + pub fn connect( + host: &str, + port: u16, + user: &str, + password: &str, + database: &str, + ) -> Result { + Ok(Self { + tx: Pool::new(format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str())? + .start_transaction(TxOpts::default())?, + }) + } + + pub fn commit(self) -> Result<(), Error> { + self.tx.commit() + } + + pub fn channel_id_by_url(&mut self, url: &str) -> Result, Error> { + self.tx.exec_first( + "SELECT `channel_id` FROM `channel` WHERE `url` = ? LIMIT 1", + (url,), + ) + } + + pub fn insert_channel(&mut self, url: &str) -> Result { + self.tx + .exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn channel_items_total_by_channel_id_guid( + &mut self, + channel_id: u64, + guid: &str, + ) -> Result { + Ok(self + .tx + .exec_first( + "SELECT COUNT(*) FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ?", + (channel_id, guid), + )? + .unwrap_or(0)) + } + + pub fn insert_channel_item( + &mut self, + channel_id: u64, + pub_date: i64, + guid: &str, + link: &str, + title: Option<&str>, + description: Option<&str>, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item` SET `channel_id` = ?, + `pub_date` = ?, + `guid` = ?, + `link` = ?, + `title` = ?, + `description` = ?", + (channel_id, pub_date, guid, link, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn contents_queue_for_provider_id( + &mut self, + provider_id: u64, + ) -> Result, Error> { + self.tx.exec( + "SELECT `c1`.`content_id`, + `c1`.`channel_item_id`, + `c1`.`provider_id`, + `c1`.`title`, + `c1`.`description` + FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS ( + SELECT NULL FROM `content` AS `c2` + WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id` + AND `c2`.`provider_id` = ? LIMIT 1 + )", + (provider_id,), + ) + } + + pub fn insert_content( + &mut self, + channel_item_id: u64, + provider_id: Option, + title: &str, + description: &str, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `content` SET `channel_item_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_item_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn insert_content_image(&mut self, content_id: u64, image_id: u64) -> Result { + self.tx.exec_drop( + "INSERT INTO `content_image` SET `content_id` = ?, `image_id` = ?", + (content_id, image_id), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn images_total_by_source(&mut self, source: &str) -> Result { + Ok(self + .tx + .exec_first("SELECT COUNT(*) FROM `image` WHERE `source` = ?", (source,))? + .unwrap_or(0)) + } + + pub fn insert_image(&mut self, source: &str, data: &[u8]) -> Result { + self.tx.exec_drop( + "INSERT INTO `image` SET `source` = ?, `data` = ?", + (source, data), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn provider_id_by_name(&mut self, name: &str) -> Result, Error> { + self.tx.exec_first( + "SELECT `provider_id` FROM `provider` WHERE `name` = ?", + (name,), + ) + } + + pub fn insert_provider(&mut self, name: &str) -> Result { + self.tx + .exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; + Ok(self.tx.last_insert_id().unwrap()) + } +} From ee083dfc45d91a4dc92ebe7d546352f419ca6e02 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 01:41:20 +0200 Subject: [PATCH 51/65] optimize db api --- crates/crawler/Cargo.toml | 2 +- crates/crawler/src/main.rs | 46 +++++---- crates/http/src/main.rs | 97 +++++++++++++------ crates/llm/Cargo.toml | 2 +- crates/llm/src/main.rs | 88 ++++++++--------- crates/mysql/Cargo.toml | 5 +- .../mysql/src/{pollable.rs => connection.rs} | 63 ++++++------ crates/mysql/src/lib.rs | 41 ++++++-- crates/mysql/src/pollable/sort.rs | 13 --- crates/mysql/src/table.rs | 14 +++ .../src/{transactional.rs => transaction.rs} | 36 ++----- 11 files changed, 215 insertions(+), 192 deletions(-) rename crates/mysql/src/{pollable.rs => connection.rs} (63%) delete mode 100644 crates/mysql/src/pollable/sort.rs rename crates/mysql/src/{transactional.rs => transaction.rs} (81%) diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml index 1de8c3d..d531744 100644 --- a/crates/crawler/Cargo.toml +++ b/crates/crawler/Cargo.toml @@ -14,7 +14,7 @@ anyhow = "1.0.100" chrono = "0.4.42" clap = { version = "4.5.54", features = ["derive"] } log = "0.4.29" -mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transactional"], path = "../mysql" } +mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transaction"], path = "../mysql" } reqwest = { version = "0.13.1", features = ["blocking"] } rss = "2.0.12" scraper = { version = "0.25.0", features = ["serde"] } diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 25c8279..0d0867a 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -3,11 +3,9 @@ mod config; use anyhow::Result; use log::{debug, info, warn}; -use mysql::Transactional; use reqwest::blocking::get; fn main() -> Result<()> { - use argument::Argument; use chrono::Local; use clap::Parser; use std::{env::var, fs::read_to_string}; @@ -26,29 +24,29 @@ fn main() -> Result<()> { .init() } - let argument = Argument::parse(); + let argument = argument::Argument::parse(); let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?; + let db = mysql::Database::pool( + &config.mysql.host, + config.mysql.port, + &config.mysql.user, + &config.mysql.password, + &config.mysql.database, + )?; info!("Crawler started"); loop { debug!("Begin new crawl queue..."); - { - // disconnect from the database immediately when exiting this scope, - // in case the `update` queue is enabled and pending for a while. - let mut db = Transactional::connect( - &config.mysql.host, - config.mysql.port, - &config.mysql.user, - &config.mysql.password, - &config.mysql.database, - )?; - for c in &config.channel { - debug!("Update `{}`...", c.url); - if let Err(e) = crawl(&mut db, c) { - warn!("Channel `{}` update failed: `{e}`", c.url) + for c in &config.channel { + debug!("Update `{}`...", c.url); + let mut tx = db.transaction()?; + match crawl(&mut tx, c) { + Ok(()) => tx.commit()?, + Err(e) => { + warn!("Channel `{}` update failed: `{e}`", c.url); + tx.rollback()? } } - db.commit()? } debug!("Crawl queue completed"); if let Some(update) = config.update { @@ -60,7 +58,7 @@ fn main() -> Result<()> { } } -fn crawl(db: &mut Transactional, channel_config: &config::Channel) -> Result<()> { +fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> { use rss::Channel; use scraper::Selector; @@ -87,9 +85,9 @@ fn crawl(db: &mut Transactional, channel_config: &config::Channel) -> Result<()> let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); - let channel_id = match db.channel_id_by_url(&channel_url)? { + let channel_id = match tx.channel_id_by_url(&channel_url)? { Some(channel_id) => channel_id, - None => db.insert_channel(&channel_url)?, + None => tx.insert_channel(&channel_url)?, }; for channel_item in channel_items.iter().take(channel_items_limit) { @@ -120,10 +118,10 @@ fn crawl(db: &mut Transactional, channel_config: &config::Channel) -> Result<()> continue; } }; - if db.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { + if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { continue; // skip next steps as processed } - let channel_item_id = db.insert_channel_item( + let channel_item_id = tx.insert_channel_item( channel_id, pub_date, guid, @@ -188,7 +186,7 @@ fn crawl(db: &mut Transactional, channel_config: &config::Channel) -> Result<()> } }, }; - let _content_id = db.insert_content(channel_item_id, None, &title, &description)?; + let _content_id = tx.insert_content(channel_item_id, None, &title, &description)?; // @TODO preload media } Ok(()) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 3964aea..0803379 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -11,7 +11,7 @@ use config::Config; use feed::Feed; use global::Global; use meta::Meta; -use mysql::{Pollable, pollable::Sort}; +use mysql::{Database, table::Sort}; use rocket::{State, http::Status, response::content::RawXml, serde::Serialize}; use rocket_dyn_templates::{Template, context}; @@ -19,7 +19,7 @@ use rocket_dyn_templates::{Template, context}; fn index( search: Option<&str>, page: Option, - db: &State, + db: &State, meta: &State, global: &State, ) -> Result { @@ -32,7 +32,11 @@ fn index( time: String, title: String, } - let total = db + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + let total = conn .contents_total_by_provider_id(global.provider_id, search) .map_err(|e| { error!("Could not get contents total: `{e}`"); @@ -65,19 +69,24 @@ fn index( back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), next: if page.unwrap_or(1) * global.list_limit >= total { None } else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, - rows: db.contents_by_provider_id(global.provider_id, search, Sort::Desc, Some(global.list_limit)).map_err(|e| { - error!("Could not get contents: `{e}`"); - Status::InternalServerError - })? + rows: conn.contents_by_provider_id( + global.provider_id, + search, + Sort::Desc, + Some(global.list_limit) + ).map_err(|e| { + error!("Could not get contents: `{e}`"); + Status::InternalServerError + })? .into_iter() - .map(|c| { - let channel_item = db.channel_item(c.channel_item_id).unwrap().unwrap(); + .map(|content| { + let channel_item = conn.channel_item(content.channel_item_id).unwrap().unwrap(); Content { - content_id: c.content_id, - description: c.description, + content_id: content.content_id, + description: content.description, link: channel_item.link, time: time(channel_item.pub_date).format(&global.format_time).to_string(), - title: c.title, + title: content.title, } }) .collect::>(), @@ -92,25 +101,38 @@ fn index( #[get("/")] fn info( content_id: u64, - db: &State, + db: &State, meta: &State, global: &State, ) -> Result { - match db.content(content_id).map_err(|e| { + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + match conn.content(content_id).map_err(|e| { error!("Could not get content `{content_id}`: `{e}`"); Status::InternalServerError })? { - Some(c) => { - let i = db.channel_item(c.channel_item_id).unwrap().unwrap(); + Some(content) => { + let channel_item = conn + .channel_item(content.channel_item_id) + .map_err(|e| { + error!("Could not get requested channel item: `{e}`"); + Status::InternalServerError + })? + .ok_or_else(|| { + error!("Could not find requested channel item"); + Status::NotFound + })?; Ok(Template::render( "info", context! { - description: c.description, - link: i.link, + description: content.description, + link: channel_item.link, meta: meta.inner(), - title: format!("{}{S}{}", c.title, meta.title), - name: c.title, - time: time(i.pub_date).format(&global.format_time).to_string(), + title: format!("{}{S}{}", content.title, meta.title), + name: content.title, + time: time(channel_item.pub_date).format(&global.format_time).to_string(), }, )) } @@ -123,30 +145,43 @@ fn rss( search: Option<&str>, global: &State, meta: &State, - db: &State, + db: &State, ) -> Result, Status> { - let mut f = Feed::new( + let mut feed = Feed::new( &meta.title, meta.description.as_deref(), 1024, // @TODO ); - for c in db + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + for content in conn .contents_by_provider_id(global.provider_id, search, Sort::Desc, Some(20)) // @TODO .map_err(|e| { error!("Could not load channel item contents: `{e}`"); Status::InternalServerError })? { - let channel_item = db.channel_item(c.channel_item_id).unwrap().unwrap(); - f.push( - c.channel_item_id, + let channel_item = conn + .channel_item(content.channel_item_id) + .map_err(|e| { + error!("Could not get requested channel item: `{e}`"); + Status::InternalServerError + })? + .ok_or_else(|| { + error!("Could not find requested channel item"); + Status::NotFound + })?; + feed.push( + content.channel_item_id, time(channel_item.pub_date), channel_item.link, - c.title, - c.description, + content.title, + content.description, ) } - Ok(RawXml(f.commit())) + Ok(RawXml(feed.commit())) } #[launch] @@ -165,7 +200,7 @@ fn rocket() -> _ { } }) .manage( - Pollable::connect( + Database::pool( &config.mysql_host, config.mysql_port, &config.mysql_username, diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index a5fa968..3e371e8 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -15,6 +15,6 @@ chrono = "0.4.42" clap = { version = "4.5.54", features = ["derive"] } lancor = "0.1.1" log = "0.4.29" -mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transactional"], path = "../mysql" } +mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transaction"], path = "../mysql" } tokio = { version = "1.0", features = ["full"] } tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } \ No newline at end of file diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs index dbcfd59..1774e00 100644 --- a/crates/llm/src/main.rs +++ b/crates/llm/src/main.rs @@ -2,7 +2,7 @@ mod argument; use anyhow::Result; use argument::Argument; -use mysql::Transactional; +use mysql::Database; #[tokio::main] async fn main() -> Result<()> { @@ -32,12 +32,17 @@ async fn main() -> Result<()> { "{}://{}:{}", arg.llm_scheme, arg.llm_host, arg.llm_port ))?; + let db = Database::pool( + &arg.mysql_host, + arg.mysql_port, + &arg.mysql_username, + &arg.mysql_password, + &arg.mysql_database, + )?; - // find existing ID by model name or create a new one - // * this feature should be moved to a separate CLI tool let provider_id = { - let mut db = tx(&arg)?; - match db.provider_id_by_name(&arg.llm_model)? { + let mut conn = db.connection()?; + match conn.provider_id_by_name(&arg.llm_model)? { Some(provider_id) => { debug!( "Use existing DB provider #{} matches model name `{}`", @@ -46,12 +51,11 @@ async fn main() -> Result<()> { provider_id } None => { - let provider_id = db.insert_provider(&arg.llm_model)?; + let provider_id = conn.insert_provider(&arg.llm_model)?; info!( "Provider `{}` not found in database, created new one with ID `{provider_id}`", &arg.llm_model ); - db.commit()?; provider_id } } @@ -60,42 +64,38 @@ async fn main() -> Result<()> { info!("Daemon started"); loop { debug!("New queue begin..."); - { - // disconnect from the database immediately when exiting this scope, - // in case the `update` queue is enabled and pending for a while. - let mut db = tx(&arg)?; - for source in db.contents_queue_for_provider_id(provider_id)? { - debug!( - "Begin generating `content_id` #{} using `provider_id` #{provider_id}.", - source.content_id - ); + let mut tx = db.transaction()?; + for source in tx.contents_queue_for_provider_id(provider_id)? { + debug!( + "Begin generating `content_id` #{} using `provider_id` #{provider_id}.", + source.content_id + ); - let title = llm - .chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( - Message::user(format!("{}\n{}", arg.llm_message, source.title)), - )) - .await?; + let title = + llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( + Message::user(format!("{}\n{}", arg.llm_message, source.title)), + )) + .await?; - let description = llm - .chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( - Message::user(format!("{}\n{}", arg.llm_message, source.description)), - )) - .await?; + let description = + llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( + Message::user(format!("{}\n{}", arg.llm_message, source.description)), + )) + .await?; - let content_id = db.insert_content( - source.channel_item_id, - Some(provider_id), - &title.choices[0].message.content, - &description.choices[0].message.content, - )?; + let content_id = tx.insert_content( + source.channel_item_id, + Some(provider_id), + &title.choices[0].message.content, + &description.choices[0].message.content, + )?; - debug!( - "Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", - source.content_id - ) - } - db.commit()? + debug!( + "Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", + source.content_id + ) } + tx.commit()?; debug!("Queue completed"); if let Some(update) = arg.update { debug!("Wait {update} seconds to continue..."); @@ -105,15 +105,3 @@ async fn main() -> Result<()> { } } } - -// in fact, there is no need for a transaction at this moment, -// as there are no related table updates, but who knows what the future holds -fn tx(arg: &Argument) -> Result { - Ok(Transactional::connect( - &arg.mysql_host, - arg.mysql_port, - &arg.mysql_username, - &arg.mysql_password, - &arg.mysql_database, - )?) -} diff --git a/crates/mysql/Cargo.toml b/crates/mysql/Cargo.toml index 7aeb4af..253d787 100644 --- a/crates/mysql/Cargo.toml +++ b/crates/mysql/Cargo.toml @@ -10,9 +10,8 @@ keywords = ["rssto", "database", "mysql", "library", "driver", "api"] repository = "https://github.com/YGGverse/rssto" [features] -default = ["pollable"] -pollable = [] -transactional = [] +default = [] +transaction = [] [dependencies] mysql = "26.0.1" \ No newline at end of file diff --git a/crates/mysql/src/pollable.rs b/crates/mysql/src/connection.rs similarity index 63% rename from crates/mysql/src/pollable.rs rename to crates/mysql/src/connection.rs index 474c427..c59e2df 100644 --- a/crates/mysql/src/pollable.rs +++ b/crates/mysql/src/connection.rs @@ -1,32 +1,20 @@ -pub mod sort; - -pub use sort::Sort; - use crate::table::*; -use mysql::{Error, Pool, prelude::Queryable}; +use mysql::{Error, Pool, PooledConn, prelude::Queryable}; /// Safe, read-only operations used in client apps like `rssto-http` -pub struct Pollable { - pool: Pool, +pub struct Connection { + conn: PooledConn, } -impl Pollable { - pub fn connect( - host: &str, - port: u16, - user: &str, - password: &str, - database: &str, - ) -> Result { +impl Connection { + pub fn create(pool: &Pool) -> Result { Ok(Self { - pool: mysql::Pool::new( - format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(), - )?, + conn: pool.get_conn()?, }) } - pub fn channel_item(&self, channel_item_id: u64) -> Result, Error> { - self.pool.get_conn()?.exec_first( + pub fn channel_item(&mut self, channel_item_id: u64) -> Result, Error> { + self.conn.exec_first( "SELECT `channel_item_id`, `channel_id`, `pub_date`, @@ -38,8 +26,8 @@ impl Pollable { ) } - pub fn content(&self, content_id: u64) -> Result, Error> { - self.pool.get_conn()?.exec_first( + pub fn content(&mut self, content_id: u64) -> Result, Error> { + self.conn.exec_first( "SELECT `content_id`, `channel_item_id`, `provider_id`, @@ -50,11 +38,11 @@ impl Pollable { } pub fn contents_total_by_provider_id( - &self, + &mut self, provider_id: Option, keyword: Option<&str>, ) -> Result { - let total: Option = self.pool.get_conn()?.exec_first( + let total: Option = self.conn.exec_first( "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?", (provider_id, like(keyword)), )?; @@ -62,13 +50,13 @@ impl Pollable { } pub fn contents_by_provider_id( - &self, + &mut self, provider_id: Option, keyword: Option<&str>, sort: Sort, limit: Option, ) -> Result, Error> { - self.pool.get_conn()?.exec(format!( + self.conn.exec(format!( "SELECT `content_id`, `channel_item_id`, `provider_id`, @@ -79,8 +67,8 @@ impl Pollable { (provider_id, like(keyword), )) } - pub fn content_image(&self, content_image_id: u64) -> Result, Error> { - self.pool.get_conn()?.exec_first( + pub fn content_image(&mut self, content_image_id: u64) -> Result, Error> { + self.conn.exec_first( "SELECT `content_image_id`, `content_id`, `image_id`, @@ -92,17 +80,24 @@ impl Pollable { ) } - pub fn images(&self, limit: Option) -> Result, Error> { - self.pool.get_conn()?.query(format!( + pub fn images(&mut self, limit: Option) -> Result, Error> { + self.conn.query(format!( "SELECT `image_id`, `source`, `data` FROM `image` LIMIT {}", limit.unwrap_or(DEFAULT_LIMIT) )) } - pub fn insert_provider(&self, name: &str) -> Result { - let mut c = self.pool.get_conn()?; - c.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; - Ok(c.last_insert_id()) + pub fn provider_id_by_name(&mut self, name: &str) -> Result, Error> { + self.conn.exec_first( + "SELECT `provider_id` FROM `provider` WHERE `name` = ?", + (name,), + ) + } + + pub fn insert_provider(&mut self, name: &str) -> Result { + self.conn + .exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; + Ok(self.conn.last_insert_id()) } } diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index c316798..53ef7d6 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -1,13 +1,36 @@ -#[cfg(feature = "pollable")] -pub mod pollable; - +mod connection; pub mod table; +#[cfg(feature = "transaction")] +mod transaction; -#[cfg(feature = "transactional")] -pub mod transactional; +pub use connection::Connection; +#[cfg(feature = "transaction")] +pub use transaction::Transaction; +pub struct Database { + pool: mysql::Pool, +} -#[cfg(feature = "pollable")] -pub use pollable::Pollable; +impl Database { + pub fn pool( + host: &str, + port: u16, + user: &str, + password: &str, + database: &str, + ) -> Result { + Ok(Self { + pool: mysql::Pool::new( + format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(), + )?, + }) + } -#[cfg(feature = "transactional")] -pub use transactional::Transactional; + pub fn connection(&self) -> Result { + Connection::create(&self.pool) + } + + #[cfg(feature = "transaction")] + pub fn transaction(&self) -> Result { + Transaction::create(&self.pool) + } +} diff --git a/crates/mysql/src/pollable/sort.rs b/crates/mysql/src/pollable/sort.rs deleted file mode 100644 index d8b121d..0000000 --- a/crates/mysql/src/pollable/sort.rs +++ /dev/null @@ -1,13 +0,0 @@ -pub enum Sort { - Asc, - Desc, -} - -impl std::fmt::Display for Sort { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match self { - Self::Asc => write!(f, "ASC"), - Self::Desc => write!(f, "DESC"), - } - } -} diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs index 5df3348..3ee92ce 100644 --- a/crates/mysql/src/table.rs +++ b/crates/mysql/src/table.rs @@ -51,3 +51,17 @@ pub struct ContentImage { pub data: Vec, pub source: String, } + +pub enum Sort { + Asc, + Desc, +} + +impl std::fmt::Display for Sort { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::Asc => write!(f, "ASC"), + Self::Desc => write!(f, "DESC"), + } + } +} diff --git a/crates/mysql/src/transactional.rs b/crates/mysql/src/transaction.rs similarity index 81% rename from crates/mysql/src/transactional.rs rename to crates/mysql/src/transaction.rs index ce80305..a39e290 100644 --- a/crates/mysql/src/transactional.rs +++ b/crates/mysql/src/transaction.rs @@ -1,24 +1,17 @@ use crate::table::*; -use mysql::{Error, Pool, Transaction, TxOpts, prelude::Queryable}; +use mysql::{Error, Pool, TxOpts, prelude::Queryable}; /// Safe, optimized read/write operations /// mostly required by the `rssto-crawler` and `rssto-llm` /// * all members implementation requires `commit` action -pub struct Transactional { - tx: Transaction<'static>, +pub struct Transaction { + tx: mysql::Transaction<'static>, } -impl Transactional { - pub fn connect( - host: &str, - port: u16, - user: &str, - password: &str, - database: &str, - ) -> Result { +impl Transaction { + pub fn create(pool: &Pool) -> Result { Ok(Self { - tx: Pool::new(format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str())? - .start_transaction(TxOpts::default())?, + tx: pool.start_transaction(TxOpts::default())?, }) } @@ -26,6 +19,10 @@ impl Transactional { self.tx.commit() } + pub fn rollback(self) -> Result<(), Error> { + self.tx.rollback() + } + pub fn channel_id_by_url(&mut self, url: &str) -> Result, Error> { self.tx.exec_first( "SELECT `channel_id` FROM `channel` WHERE `url` = ? LIMIT 1", @@ -132,17 +129,4 @@ impl Transactional { )?; Ok(self.tx.last_insert_id().unwrap()) } - - pub fn provider_id_by_name(&mut self, name: &str) -> Result, Error> { - self.tx.exec_first( - "SELECT `provider_id` FROM `provider` WHERE `name` = ?", - (name,), - ) - } - - pub fn insert_provider(&mut self, name: &str) -> Result { - self.tx - .exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; - Ok(self.tx.last_insert_id().unwrap()) - } } From bc61b5c09c63fe3b621f540f73977f9d6258bf06 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 11:53:22 +0200 Subject: [PATCH 52/65] move `config.toml` example into crate root --- crates/crawler/{config/example.toml => config.toml} | 0 crates/crawler/src/argument.rs | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename crates/crawler/{config/example.toml => config.toml} (100%) diff --git a/crates/crawler/config/example.toml b/crates/crawler/config.toml similarity index 100% rename from crates/crawler/config/example.toml rename to crates/crawler/config.toml diff --git a/crates/crawler/src/argument.rs b/crates/crawler/src/argument.rs index 5443edd..3894dd5 100644 --- a/crates/crawler/src/argument.rs +++ b/crates/crawler/src/argument.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; pub struct Argument { /// Path to config file /// - /// * see `config/example.toml` + /// * see `config.toml` #[arg(short, long)] pub config: PathBuf, } From ec0cca64f3b02b8606d17ddc72840af4e47b7e48 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 14:38:01 +0200 Subject: [PATCH 53/65] implement `persist_images_selector`, minimize codebase by using `bail`, change image table structure to use sha256 hash as the unique image identity --- crates/crawler/Cargo.toml | 1 + crates/crawler/config.toml | 4 +- crates/crawler/src/config.rs | 3 + crates/crawler/src/main.rs | 141 +++++++++++++++----------------- crates/mysql/database/0.1.0.sql | 8 +- crates/mysql/src/table.rs | 8 +- crates/mysql/src/transaction.rs | 22 +++-- 7 files changed, 97 insertions(+), 90 deletions(-) diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml index d531744..6e55b06 100644 --- a/crates/crawler/Cargo.toml +++ b/crates/crawler/Cargo.toml @@ -19,6 +19,7 @@ reqwest = { version = "0.13.1", features = ["blocking"] } rss = "2.0.12" scraper = { version = "0.25.0", features = ["serde"] } serde = { version = "1.0.228", features = ["derive"] } +sha2 = "0.10.9" toml = "0.9.10" tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } url = { version = "2.5.8", features = ["serde"] } \ No newline at end of file diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml index bde12ba..3232c16 100644 --- a/crates/crawler/config.toml +++ b/crates/crawler/config.toml @@ -19,6 +19,7 @@ persist_item_description = true # optional: # content_title_selector = "h1" # content_description_selector = "article" +# persist_images_selector = "img" [[channel]] url = "https://" @@ -27,4 +28,5 @@ persist_item_title = true persist_item_description = true # optional: # content_title_selector = "h1" -# content_description_selector = "article" \ No newline at end of file +# content_description_selector = "article" +# persist_images_selector = "img" \ No newline at end of file diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index dc325b5..701c6e4 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -27,6 +27,9 @@ pub struct Channel { /// Scrape description by CSS selector /// * None to ignore pub content_description_selector: Option, + /// Preload content images locally if `Some` + /// * currently stored in the database + pub persist_images_selector: Option, } #[derive(Debug, Deserialize)] diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 0d0867a..0767499 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -1,9 +1,10 @@ mod argument; mod config; -use anyhow::Result; +use anyhow::{Result, bail}; use log::{debug, info, warn}; use reqwest::blocking::get; +use url::Url; fn main() -> Result<()> { use chrono::Local; @@ -59,64 +60,40 @@ fn main() -> Result<()> { } fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> { - use rss::Channel; - use scraper::Selector; - - /// local helper - fn scrape(url: &str, selector: &Selector) -> Result> { - let document = scraper::Html::parse_document(&get(url)?.text()?); - Ok(if let Some(first) = document.select(selector).next() { - Some(first.inner_html()) - } else { - warn!("Could not scrape requested inner"); - None - }) - } - let channel_url = channel_config.url.to_string(); // allocate once - let channel_items = match Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { - Ok(response) => response.into_items(), - Err(e) => { - warn!("Could not parse response from `{channel_url}`: `{e}`"); - return Ok(()); - } - }; + let channel_items = + match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { + Ok(response) => response.into_items(), + Err(e) => bail!("Could not parse response: `{e}`"), + }; let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); let channel_id = match tx.channel_id_by_url(&channel_url)? { Some(channel_id) => channel_id, - None => tx.insert_channel(&channel_url)?, + None => { + let channel_id = tx.insert_channel(&channel_url)?; + info!("Register new channel #{channel_id} ({channel_url})"); + channel_id + } }; for channel_item in channel_items.iter().take(channel_items_limit) { let guid = match channel_item.guid { Some(ref guid) => guid.value.as_ref(), - None => { - warn!("Undefined `guid` field in `{channel_url}`"); - continue; - } + None => bail!("Undefined `guid` field"), }; - let link = match channel_item.link { - Some(ref link) => link, - None => { - warn!("Undefined `link` field in `{channel_url}`"); - continue; - } + let (link, base) = match channel_item.link { + Some(ref link) => (link, Url::parse(link)?), + None => bail!("Undefined `link` field"), }; let pub_date = match channel_item.pub_date { Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) { Ok(t) => t.timestamp(), - Err(e) => { - warn!("Invalid `pub_date` field in `{channel_url}`: `{e}`"); - continue; - } + Err(e) => bail!("Invalid `pub_date` field: `{e}`"), }, - None => { - warn!("Undefined `pub_date` field in `{channel_url}`"); - continue; - } + None => bail!("Undefined `pub_date`"), }; if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { continue; // skip next steps as processed @@ -137,57 +114,67 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul None }, )?; + info!("Register new channel item #{channel_item_id} ({link})"); // preload remote content.. + let html = scraper::Html::parse_document(&get(link)?.text()?); let title = match channel_config.content_title_selector { - Some(ref selector) => match scrape(link, selector) { - Ok(value) => match value { - Some(title) => title, - None => { - warn!("Could not scrape `title` selector in `{channel_url}`"); - continue; - } - }, - Err(e) => { - warn!("Could not update `title` selector in `{channel_url}`: `{e}`"); - continue; - } + Some(ref selector) => match html.select(selector).next() { + Some(title) => title.inner_html(), + None => bail!("Could not scrape `title` selector from `{link}`"), }, None => match channel_item.title { Some(ref title) => title.clone(), - None => { - warn!( - "Could not assign `title` from channel item for content in `{channel_url}`" - ); - continue; - } + None => bail!("Could not assign `title` from channel item for content in `{link}`"), }, }; let description = match channel_config.content_description_selector { - Some(ref selector) => match scrape(link, selector) { - Ok(value) => match value { - Some(description) => description, - None => { - warn!("Could not scrape `description` selector in `{channel_url}`"); - continue; - } - }, - Err(e) => { - warn!("Could not update `description` selector in `{channel_url}`: `{e}`"); - continue; - } + Some(ref selector) => match html.select(selector).next() { + Some(description) => description.inner_html(), + None => bail!("Could not scrape `description` selector from `{link}`"), }, None => match channel_item.description { Some(ref description) => description.clone(), None => { - warn!( - "Could not assign `description` from channel item for content in `{channel_url}`" - ); - continue; + bail!("Could not assign `description` from channel item for `{link}`") } }, }; - let _content_id = tx.insert_content(channel_item_id, None, &title, &description)?; - // @TODO preload media + let content_id = tx.insert_content(channel_item_id, None, &title, &description)?; + info!("Add new content record #{content_id} ({title})"); + // persist images if enabled + if let Some(ref selector) = channel_config.persist_images_selector { + use sha2::{Digest, Sha256}; + for element in scraper::Html::parse_document(&description).select(selector) { + if let Some(src) = element.value().attr("src") { + let absolute = match Url::parse(src) { + Ok(url) => url, + Err(e) => { + if e == url::ParseError::RelativeUrlWithoutBase { + let absolute = base.join(link)?; + debug!("Convert relative image link `{link}` to `{absolute}`"); + absolute + } else { + bail!("Could not parse URL from img source: `{e}`") + } + } + }; + let url = absolute.as_str(); + let data = get(url)?.bytes()?; + let hash = format!("{:x}", Sha256::digest(&data)); + + let image_id = match tx.image_id_by_sha256(&hash)? { + Some(image_id) => image_id, + None => { + let image_id = tx.insert_image(&hash, Some(src), Some(url), &data)?; + info!("Persist new image #{image_id} (`{absolute}`)"); + image_id + } + }; + let content_image_id = tx.insert_content_image(content_id, image_id)?; + debug!("Add content image relationship #{content_image_id}") + } + } + } } Ok(()) } diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index 6c318f2..9524e12 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- пт, 09-січ-2026 17:57:03 +0200 +-- сб, 10-січ-2026 14:27:50 +0200 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -92,10 +92,12 @@ ENGINE = InnoDB; -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`image` ( `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `source` VARCHAR(2048) NOT NULL, + `sha256` CHAR(64) NOT NULL, + `src` VARCHAR(2048) NULL, + `url` VARCHAR(2048) NULL, `data` MEDIUMBLOB NOT NULL, PRIMARY KEY (`image_id`), - UNIQUE INDEX `source_UNIQUE` (`source` ASC) VISIBLE) + UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE) ENGINE = InnoDB; diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs index 3ee92ce..631bc37 100644 --- a/crates/mysql/src/table.rs +++ b/crates/mysql/src/table.rs @@ -37,7 +37,13 @@ pub struct Provider { #[derive(Debug, PartialEq, Eq, FromRow)] pub struct Image { pub image_id: u64, - pub source: String, + /// Keep image unique by comparing its data hash + pub sha256: String, + /// Original `src` tag value to post-replacing + pub src: Option, + /// Resolved absolute URL + pub url: Option, + /// Image data, MEDIUMBLOB (16M) pub data: Vec, } diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs index a39e290..919b56b 100644 --- a/crates/mysql/src/transaction.rs +++ b/crates/mysql/src/transaction.rs @@ -115,17 +115,23 @@ impl Transaction { Ok(self.tx.last_insert_id().unwrap()) } - pub fn images_total_by_source(&mut self, source: &str) -> Result { - Ok(self - .tx - .exec_first("SELECT COUNT(*) FROM `image` WHERE `source` = ?", (source,))? - .unwrap_or(0)) + pub fn image_id_by_sha256(&mut self, sha256: &str) -> Result, Error> { + self.tx.exec_first( + "SELECT `image_id` FROM `image` WHERE `sha256` = ? LIMIT 1", + (sha256,), + ) } - pub fn insert_image(&mut self, source: &str, data: &[u8]) -> Result { + pub fn insert_image( + &mut self, + sha256: &str, + src: Option<&str>, + url: Option<&str>, + data: &[u8], + ) -> Result { self.tx.exec_drop( - "INSERT INTO `image` SET `source` = ?, `data` = ?", - (source, data), + "INSERT INTO `image` SET `sha256` = ?, `src` = ?, `url` = ?, `data` = ?", + (sha256, src, url, data), )?; Ok(self.tx.last_insert_id().unwrap()) } From 3e94399ccbf04fd3e366fec7445657a4e15b386e Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 16:37:28 +0200 Subject: [PATCH 54/65] use config file instead of argument options --- crates/crawler/config.toml | 2 +- crates/crawler/src/config.rs | 2 +- crates/crawler/src/main.rs | 2 +- crates/http/Cargo.toml | 2 + crates/http/README.md | 7 +--- crates/http/config.toml | 33 +++++++++++++++++ crates/http/src/argument.rs | 12 ++++++ crates/http/src/config.rs | 71 +++++++++++------------------------- crates/http/src/main.rs | 16 ++++---- crates/llm/Cargo.toml | 4 +- crates/llm/README.md | 8 +--- crates/llm/config.toml | 22 +++++++++++ crates/llm/src/argument.rs | 37 +++---------------- crates/llm/src/config.rs | 27 ++++++++++++++ crates/llm/src/main.rs | 40 ++++++++++---------- 15 files changed, 162 insertions(+), 123 deletions(-) create mode 100644 crates/http/config.toml create mode 100644 crates/http/src/argument.rs create mode 100644 crates/llm/config.toml create mode 100644 crates/llm/src/config.rs diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml index 3232c16..ad50346 100644 --- a/crates/crawler/config.toml +++ b/crates/crawler/config.toml @@ -6,7 +6,7 @@ update = 900 [mysql] host = "localhost" port = 3306 -user = "" +username = "" password = "" database = "rssto" diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index 701c6e4..b4734cc 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -8,7 +8,7 @@ pub struct Mysql { pub host: String, pub password: String, pub port: u16, - pub user: String, + pub username: String, } #[derive(Debug, Deserialize)] diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 0767499..881b9b6 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -30,7 +30,7 @@ fn main() -> Result<()> { let db = mysql::Database::pool( &config.mysql.host, config.mysql.port, - &config.mysql.user, + &config.mysql.username, &config.mysql.password, &config.mysql.database, )?; diff --git a/crates/http/Cargo.toml b/crates/http/Cargo.toml index e2cfbf8..e5b8c8c 100644 --- a/crates/http/Cargo.toml +++ b/crates/http/Cargo.toml @@ -15,3 +15,5 @@ clap = { version = "4.5.54", features = ["derive"] } mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } rocket = "0.5.1" rocket_dyn_templates = { version = "0.2.0", features = ["tera"] } +serde = { version = "1.0.228", features = ["derive"] } +toml = "0.9.10" \ No newline at end of file diff --git a/crates/http/README.md b/crates/http/README.md index c9edc0e..e9b1b42 100644 --- a/crates/http/README.md +++ b/crates/http/README.md @@ -7,8 +7,5 @@ Web server implementation based on the Rocket engine ``` cd rssto/crates/rssto-http -cargo run -- --mysql-username {USER} \ - --mysql-password {PASS} \ - --mysql-database {NAME} -``` -* optionally, use `--provider-id {ID}` to filter content using post-processing results (e.g. generated by the `rssto-llm` crate) \ No newline at end of file +cargo run -- -c /path/to/config.toml +``` \ No newline at end of file diff --git a/crates/http/config.toml b/crates/http/config.toml new file mode 100644 index 0000000..f4ade36 --- /dev/null +++ b/crates/http/config.toml @@ -0,0 +1,33 @@ +title = "rssto" +#description = "" + +# Replace image sources with local +# * if crawled with the `persist_images_selector` selector +local_images = true + +format_time = "%d/%m/%Y %H:%M" + +# Provider ID (`provider` table) +# * None for the original content +# provider_id = 1 + +# Default listing limit +list_limit = 20 + +# Bind server on given host +host = "127.0.0.1" + +# Bind server on given port +port = 8000 + +#Configure instance in the debug mode +debug = true + +# Database connection setup +# * see crates/mysql/database +[mysql] +host = "localhost" +port = 3306 +username = "" +password = "" +database = "rssto" \ No newline at end of file diff --git a/crates/http/src/argument.rs b/crates/http/src/argument.rs new file mode 100644 index 0000000..3894dd5 --- /dev/null +++ b/crates/http/src/argument.rs @@ -0,0 +1,12 @@ +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +pub struct Argument { + /// Path to config file + /// + /// * see `config.toml` + #[arg(short, long)] + pub config: PathBuf, +} diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs index 9ca317f..a6068dd 100644 --- a/crates/http/src/config.rs +++ b/crates/http/src/config.rs @@ -1,53 +1,24 @@ -use clap::Parser; -use std::net::{IpAddr, Ipv4Addr}; +use serde::Deserialize; +use std::net::IpAddr; -#[derive(Parser, Debug)] -#[command(version, about, long_about = None)] -pub struct Config { - /// Server name - #[arg(long, default_value_t = String::from("rssto"))] - pub title: String, - - /// Server description - #[arg(long)] - pub description: Option, - - /// Format timestamps (on the web view) - /// - /// * tip: escape with `%%d/%%m/%%Y %%H:%%M` in the CLI/bash argument - #[arg(long, default_value_t = String::from("%d/%m/%Y %H:%M"))] - pub format_time: String, - - /// Provider ID (`provider` table) - /// * None for the original content - #[arg(long, short)] - pub provider_id: Option, - - /// Default listing limit - #[arg(long, default_value_t = 20)] - pub list_limit: usize, - - /// Bind server on given host - #[arg(long, default_value_t = IpAddr::V4(Ipv4Addr::LOCALHOST))] - pub host: IpAddr, - - /// Bind server on given port - #[arg(long, default_value_t = 8000)] +#[derive(Debug, Deserialize)] +pub struct Mysql { + pub database: String, + pub host: String, + pub password: String, pub port: u16, - - /// Configure instance in the debug mode - #[arg(long, default_value_t = false)] - pub debug: bool, - - // Database - #[arg(long, default_value_t = String::from("localhost"))] - pub mysql_host: String, - #[arg(long, default_value_t = 3306)] - pub mysql_port: u16, - #[arg(long)] - pub mysql_username: String, - #[arg(long)] - pub mysql_password: String, - #[arg(long)] - pub mysql_database: String, + pub username: String, +} + +#[derive(Debug, Deserialize)] +pub struct Config { + pub mysql: Mysql, + pub title: String, + pub description: Option, + pub format_time: String, + pub provider_id: Option, + pub list_limit: usize, + pub host: IpAddr, + pub port: u16, + pub debug: bool, } diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 0803379..df8ecd4 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -1,13 +1,13 @@ #[macro_use] extern crate rocket; +mod argument; mod config; mod feed; mod global; mod meta; use chrono::{DateTime, Utc}; -use config::Config; use feed::Feed; use global::Global; use meta::Meta; @@ -187,7 +187,9 @@ fn rss( #[launch] fn rocket() -> _ { use clap::Parser; - let config = Config::parse(); + let argument = argument::Argument::parse(); + let config: config::Config = + toml::from_str(&std::fs::read_to_string(argument.config).unwrap()).unwrap(); rocket::build() .attach(Template::fairing()) .configure(rocket::Config { @@ -201,11 +203,11 @@ fn rocket() -> _ { }) .manage( Database::pool( - &config.mysql_host, - config.mysql_port, - &config.mysql_username, - &config.mysql_password, - &config.mysql_database, + &config.mysql.host, + config.mysql.port, + &config.mysql.username, + &config.mysql.password, + &config.mysql.database, ) .unwrap(), ) diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index 3e371e8..6f62481 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -16,5 +16,7 @@ clap = { version = "4.5.54", features = ["derive"] } lancor = "0.1.1" log = "0.4.29" mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transaction"], path = "../mysql" } +serde = { version = "1.0.228", features = ["derive"] } tokio = { version = "1.0", features = ["full"] } -tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } \ No newline at end of file +toml = "0.9.10" +tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } diff --git a/crates/llm/README.md b/crates/llm/README.md index ecb64a8..16cb13d 100644 --- a/crates/llm/README.md +++ b/crates/llm/README.md @@ -17,12 +17,6 @@ llama-server -hf ggml-org/gemma-3-1b-it-GGUF ``` cd rssto/crates/rssto-llm -cargo run -- --mysql-username {USER} \ - --mysql-password {PASS} \ - --mysql-database {NAME} \ - --llm-host {HOST} \ - --llm-port {PORT} \ - --llm-model {MODEL} \ - --llm-message {MESSAGE} +cargo run -- -c /path/to/config.toml ``` * see `--help` to display all supported options \ No newline at end of file diff --git a/crates/llm/config.toml b/crates/llm/config.toml new file mode 100644 index 0000000..eb9f926 --- /dev/null +++ b/crates/llm/config.toml @@ -0,0 +1,22 @@ +# Rescan database for new subjects, in seconds +# * process once if not defined +# update = 900 + +# Database connection setup +# * see crates/mysql/database +[mysql] +host = "localhost" +port = 3306 +username = "" +password = "" +database = "rssto" + +# LLM connection setup +[llm] +scheme = "http" +host = "127.0.0.1" +port = 8080 +# Model name +model = "ggml-org/gemma-3-1b-it-GGUF" +# Initial message for the `content` subject (e.g. `translate to...`) +message = "translate to english:" \ No newline at end of file diff --git a/crates/llm/src/argument.rs b/crates/llm/src/argument.rs index a3cc51d..3894dd5 100644 --- a/crates/llm/src/argument.rs +++ b/crates/llm/src/argument.rs @@ -1,37 +1,12 @@ use clap::Parser; +use std::path::PathBuf; #[derive(Parser, Debug)] #[command(version, about, long_about = None)] pub struct Argument { - // LLM - #[arg(long, default_value_t = String::from("http"))] - pub llm_scheme: String, - #[arg(long, default_value_t = String::from("localhost"))] - pub llm_host: String, - #[arg(long, default_value_t = 8080)] - pub llm_port: u16, - - /// Model name (e.g. `INSAIT-Institute/MamayLM-Gemma-2-9B-IT-v0.1-GGUF` or `ggml-org/gemma-3-1b-it-GGUF`) - #[arg(long)] - pub llm_model: String, - - /// Initial message for the `content` subject (e.g. `translate to...`) - #[arg(long)] - pub llm_message: String, - - // Database - #[arg(long, default_value_t = String::from("localhost"))] - pub mysql_host: String, - #[arg(long, default_value_t = 3306)] - pub mysql_port: u16, - #[arg(long)] - pub mysql_username: String, - #[arg(long)] - pub mysql_password: String, - #[arg(long)] - pub mysql_database: String, - /// Loop update in seconds - /// * None to exit on complete - #[arg(long, short)] - pub update: Option, + /// Path to config file + /// + /// * see `config.toml` + #[arg(short, long)] + pub config: PathBuf, } diff --git a/crates/llm/src/config.rs b/crates/llm/src/config.rs new file mode 100644 index 0000000..9655ea6 --- /dev/null +++ b/crates/llm/src/config.rs @@ -0,0 +1,27 @@ +use serde::Deserialize; +use std::net::IpAddr; + +#[derive(Debug, Deserialize)] +pub struct Mysql { + pub database: String, + pub host: IpAddr, + pub password: String, + pub port: u16, + pub username: String, +} + +#[derive(Debug, Deserialize)] +pub struct Llm { + pub scheme: String, + pub host: IpAddr, + pub port: u16, + pub model: String, + pub message: String, +} + +#[derive(Debug, Deserialize)] +pub struct Config { + pub mysql: Mysql, + pub llm: Llm, + pub update: Option, +} diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs index 1774e00..188902f 100644 --- a/crates/llm/src/main.rs +++ b/crates/llm/src/main.rs @@ -1,7 +1,7 @@ mod argument; +mod config; use anyhow::Result; -use argument::Argument; use mysql::Database; #[tokio::main] @@ -27,34 +27,36 @@ async fn main() -> Result<()> { .init() } - let arg = Argument::parse(); + let argument = argument::Argument::parse(); + let config: config::Config = toml::from_str(&std::fs::read_to_string(argument.config)?)?; + let llm = LlamaCppClient::new(format!( "{}://{}:{}", - arg.llm_scheme, arg.llm_host, arg.llm_port + config.llm.scheme, config.llm.host, config.llm.port ))?; let db = Database::pool( - &arg.mysql_host, - arg.mysql_port, - &arg.mysql_username, - &arg.mysql_password, - &arg.mysql_database, + &config.mysql.host.to_string(), + config.mysql.port, + &config.mysql.username, + &config.mysql.password, + &config.mysql.database, )?; let provider_id = { let mut conn = db.connection()?; - match conn.provider_id_by_name(&arg.llm_model)? { + match conn.provider_id_by_name(&config.llm.model)? { Some(provider_id) => { debug!( "Use existing DB provider #{} matches model name `{}`", - provider_id, &arg.llm_model + provider_id, &config.llm.model ); provider_id } None => { - let provider_id = conn.insert_provider(&arg.llm_model)?; + let provider_id = conn.insert_provider(&config.llm.model)?; info!( "Provider `{}` not found in database, created new one with ID `{provider_id}`", - &arg.llm_model + &config.llm.model ); provider_id } @@ -71,15 +73,15 @@ async fn main() -> Result<()> { source.content_id ); - let title = - llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( - Message::user(format!("{}\n{}", arg.llm_message, source.title)), + let title = llm + .chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, source.title)), )) .await?; - let description = - llm.chat_completion(ChatCompletionRequest::new(&arg.llm_model).message( - Message::user(format!("{}\n{}", arg.llm_message, source.description)), + let description = llm + .chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, source.description)), )) .await?; @@ -97,7 +99,7 @@ async fn main() -> Result<()> { } tx.commit()?; debug!("Queue completed"); - if let Some(update) = arg.update { + if let Some(update) = config.update { debug!("Wait {update} seconds to continue..."); std::thread::sleep(std::time::Duration::from_secs(update)) } else { From e86b241ee6423743c1b5d6c70484d3f538386a85 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 17:53:48 +0200 Subject: [PATCH 55/65] implement local image features --- crates/crawler/src/main.rs | 5 ++++- crates/http/config.toml | 4 ---- crates/http/src/main.rs | 32 +++++++++++++++++++++------ crates/http/templates/index.html.tera | 5 +---- crates/mysql/src/connection.rs | 14 +++++++----- crates/mysql/src/transaction.rs | 13 +++++++++++ 6 files changed, 52 insertions(+), 21 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 881b9b6..c766142 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -171,7 +171,10 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul } }; let content_image_id = tx.insert_content_image(content_id, image_id)?; - debug!("Add content image relationship #{content_image_id}") + debug!("Add content image relationship #{content_image_id}"); + let uri = format!("/image/{image_id}"); + tx.replace_content_description(content_id, src, &uri)?; + debug!("Replace content image in description from `{src}` to `{uri}`") } } } diff --git a/crates/http/config.toml b/crates/http/config.toml index f4ade36..0e81ccb 100644 --- a/crates/http/config.toml +++ b/crates/http/config.toml @@ -1,10 +1,6 @@ title = "rssto" #description = "" -# Replace image sources with local -# * if crawled with the `persist_images_selector` selector -local_images = true - format_time = "%d/%m/%Y %H:%M" # Provider ID (`provider` table) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index df8ecd4..568b95b 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -12,7 +12,12 @@ use feed::Feed; use global::Global; use meta::Meta; use mysql::{Database, table::Sort}; -use rocket::{State, http::Status, response::content::RawXml, serde::Serialize}; +use rocket::{ + State, + http::{ContentType, Status}, + response::content::RawXml, + serde::Serialize, +}; use rocket_dyn_templates::{Template, context}; #[get("/?&")] @@ -25,9 +30,8 @@ fn index( ) -> Result { #[derive(Serialize)] #[serde(crate = "rocket::serde")] - struct Content { + struct Row { content_id: u64, - description: String, link: String, time: String, title: String, @@ -81,15 +85,14 @@ fn index( .into_iter() .map(|content| { let channel_item = conn.channel_item(content.channel_item_id).unwrap().unwrap(); - Content { + Row { content_id: content.content_id, - description: content.description, link: channel_item.link, time: time(channel_item.pub_date).format(&global.format_time).to_string(), title: content.title, } }) - .collect::>(), + .collect::>(), page: page.unwrap_or(1), pages: (total as f64 / global.list_limit as f64).ceil(), total, @@ -140,6 +143,21 @@ fn info( } } +#[get("/image/")] +fn image(image_id: u64, db: &State) -> Result<(ContentType, Vec), Status> { + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + match conn.image(image_id).map_err(|e| { + error!("Could not get content image `{image_id}`: `{e}`"); + Status::InternalServerError + })? { + Some(image) => Ok((ContentType::Bytes, image.data)), + None => Err(Status::NotFound), + } +} + #[get("/rss?")] fn rss( search: Option<&str>, @@ -221,7 +239,7 @@ fn rocket() -> _ { title: config.title, version: env!("CARGO_PKG_VERSION").into(), }) - .mount("/", routes![index, rss, info]) + .mount("/", routes![index, rss, info, image]) } const S: &str = " • "; diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera index d695261..1e87e54 100644 --- a/crates/http/templates/index.html.tera +++ b/crates/http/templates/index.html.tera @@ -5,10 +5,7 @@

{{ row.title }}

- {% if row.time %}

{{ row.time }}

{% endif %} -
- {{ row.description | safe }} -
+ {{ row.time }}
{% endfor %} {% else %} diff --git a/crates/mysql/src/connection.rs b/crates/mysql/src/connection.rs index c59e2df..6b6bda6 100644 --- a/crates/mysql/src/connection.rs +++ b/crates/mysql/src/connection.rs @@ -80,11 +80,15 @@ impl Connection { ) } - pub fn images(&mut self, limit: Option) -> Result, Error> { - self.conn.query(format!( - "SELECT `image_id`, `source`, `data` FROM `image` LIMIT {}", - limit.unwrap_or(DEFAULT_LIMIT) - )) + pub fn image(&mut self, image_id: u64) -> Result, Error> { + self.conn.exec_first( + "SELECT `image_id`, + `sha256`, + `src`, + `url`, + `data` FROM `image` WHERE `image_id` = ?", + (image_id,), + ) } pub fn provider_id_by_name(&mut self, name: &str) -> Result, Error> { diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs index 919b56b..82618dc 100644 --- a/crates/mysql/src/transaction.rs +++ b/crates/mysql/src/transaction.rs @@ -107,6 +107,19 @@ impl Transaction { Ok(self.tx.last_insert_id().unwrap()) } + pub fn replace_content_description( + &mut self, + content_id: u64, + from: &str, + to: &str, + ) -> Result<(), Error> { + self.tx.exec_drop( + "UPDATE `content` SET `description` = REPLACE(`description`, ?, ?) + WHERE`content_id` = ?", + (from, to, content_id), + ) + } + pub fn insert_content_image(&mut self, content_id: u64, image_id: u64) -> Result { self.tx.exec_drop( "INSERT INTO `content_image` SET `content_id` = ?, `image_id` = ?", From 2463446fcd226bf91fc1d043f89801c07b8e2e8d Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 18:05:26 +0200 Subject: [PATCH 56/65] set default capacity --- crates/http/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 568b95b..aa5e4e6 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -50,7 +50,7 @@ fn index( "index", context! { title: { - let mut t = String::new(); + let mut t = String::with_capacity(9); if let Some(q) = search && !q.is_empty() { t.push_str(q); t.push_str(S); From cc89dd6b9ca52f1f907ff70a7315e813977698a4 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 18:12:01 +0200 Subject: [PATCH 57/65] fix pagination offset --- crates/http/src/main.rs | 9 ++++++++- crates/mysql/src/connection.rs | 4 +++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index aa5e4e6..89fe477 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -77,6 +77,7 @@ fn index( global.provider_id, search, Sort::Desc, + page.map(|p| p - 1 * global.list_limit), Some(global.list_limit) ).map_err(|e| { error!("Could not get contents: `{e}`"); @@ -175,7 +176,13 @@ fn rss( Status::InternalServerError })?; for content in conn - .contents_by_provider_id(global.provider_id, search, Sort::Desc, Some(20)) // @TODO + .contents_by_provider_id( + global.provider_id, + search, + Sort::Desc, + None, + Some(global.list_limit), + ) .map_err(|e| { error!("Could not load channel item contents: `{e}`"); Status::InternalServerError diff --git a/crates/mysql/src/connection.rs b/crates/mysql/src/connection.rs index 6b6bda6..d9617d1 100644 --- a/crates/mysql/src/connection.rs +++ b/crates/mysql/src/connection.rs @@ -54,6 +54,7 @@ impl Connection { provider_id: Option, keyword: Option<&str>, sort: Sort, + start: Option, limit: Option, ) -> Result, Error> { self.conn.exec(format!( @@ -61,7 +62,8 @@ impl Connection { `channel_item_id`, `provider_id`, `title`, - `description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {}", + `description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {},{}", + start.unwrap_or(0), limit.unwrap_or(DEFAULT_LIMIT) ), (provider_id, like(keyword), )) From 843352bff28558e889ea08f141b58e5e712d6814 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 21:21:09 +0200 Subject: [PATCH 58/65] fix math priority with potential unsigned value issues --- crates/http/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 89fe477..978387b 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -77,7 +77,7 @@ fn index( global.provider_id, search, Sort::Desc, - page.map(|p| p - 1 * global.list_limit), + page.map(|p| if p > 1 { p - 1 } else { 1 } * global.list_limit), Some(global.list_limit) ).map_err(|e| { error!("Could not get contents: `{e}`"); From 55700495880ccc64361a7be61e9dd960f2124ca8 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 21:25:46 +0200 Subject: [PATCH 59/65] implement html sanitizing --- crates/crawler/Cargo.toml | 1 + crates/crawler/src/main.rs | 48 ++++++++++++++++++++++----------- crates/mysql/src/transaction.rs | 4 +-- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml index 6e55b06..f6275eb 100644 --- a/crates/crawler/Cargo.toml +++ b/crates/crawler/Cargo.toml @@ -10,6 +10,7 @@ categories = ["command-line-utilities", "parsing", "text-processing", "value-for repository = "https://github.com/YGGverse/rssto" [dependencies] +ammonia = "4.1.2" anyhow = "1.0.100" chrono = "0.4.42" clap = { version = "4.5.54", features = ["derive"] } diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index c766142..8d53155 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -60,6 +60,15 @@ fn main() -> Result<()> { } fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> { + use ammonia::clean; + + fn strip_tags(html: &str) -> String { + ammonia::Builder::new() + .tags(std::collections::HashSet::new()) + .clean(html) + .to_string() + } + let channel_url = channel_config.url.to_string(); // allocate once let channel_items = @@ -104,12 +113,12 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul guid, link, if channel_config.persist_item_title { - channel_item.title() + channel_item.title().map(strip_tags) } else { None }, if channel_config.persist_item_description { - channel_item.description() + channel_item.description().map(clean) } else { None }, @@ -117,17 +126,7 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul info!("Register new channel item #{channel_item_id} ({link})"); // preload remote content.. let html = scraper::Html::parse_document(&get(link)?.text()?); - let title = match channel_config.content_title_selector { - Some(ref selector) => match html.select(selector).next() { - Some(title) => title.inner_html(), - None => bail!("Could not scrape `title` selector from `{link}`"), - }, - None => match channel_item.title { - Some(ref title) => title.clone(), - None => bail!("Could not assign `title` from channel item for content in `{link}`"), - }, - }; - let description = match channel_config.content_description_selector { + let description = clean(&match channel_config.content_description_selector { Some(ref selector) => match html.select(selector).next() { Some(description) => description.inner_html(), None => bail!("Could not scrape `description` selector from `{link}`"), @@ -138,9 +137,26 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul bail!("Could not assign `description` from channel item for `{link}`") } }, - }; - let content_id = tx.insert_content(channel_item_id, None, &title, &description)?; - info!("Add new content record #{content_id} ({title})"); + }); + let content_id = tx.insert_content( + channel_item_id, + None, + strip_tags(&match channel_config.content_title_selector { + Some(ref selector) => match html.select(selector).next() { + Some(title) => title.inner_html(), + None => bail!("Could not scrape `title` selector from `{link}`"), + }, + None => match channel_item.title { + Some(ref title) => title.clone(), + None => { + bail!("Could not assign `title` from channel item for content in `{link}`") + } + }, + }) + .trim(), + clean(&description).trim(), + )?; + info!("Add new content record #{content_id}"); // persist images if enabled if let Some(ref selector) = channel_config.persist_images_selector { use sha2::{Digest, Sha256}; diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs index 82618dc..c2a2077 100644 --- a/crates/mysql/src/transaction.rs +++ b/crates/mysql/src/transaction.rs @@ -56,8 +56,8 @@ impl Transaction { pub_date: i64, guid: &str, link: &str, - title: Option<&str>, - description: Option<&str>, + title: Option, + description: Option, ) -> Result { self.tx.exec_drop( "INSERT INTO `channel_item` SET `channel_id` = ?, From 89cd7cb9cf498f6e10870bc6415a444f491b3ea0 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 22:47:46 +0200 Subject: [PATCH 60/65] minor template corrections --- crates/http/templates/index.html.tera | 6 ++---- crates/http/templates/info.html.tera | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera index 1e87e54..65082f9 100644 --- a/crates/http/templates/index.html.tera +++ b/crates/http/templates/index.html.tera @@ -5,7 +5,7 @@

{{ row.title }}

- {{ row.time }} +

{{ row.time }}

{% endfor %} {% else %} @@ -14,8 +14,6 @@ {% if next %}Next{% endif %} {% if back %}Back{% endif %} {% if total %} - - Page {{ page }} / {{ pages }} ({{ total }} total) - +

Page {{ page }} / {{ pages }} ({{ total }} total)

{% endif %} {% endblock content %} \ No newline at end of file diff --git a/crates/http/templates/info.html.tera b/crates/http/templates/info.html.tera index deeaddf..fd74623 100644 --- a/crates/http/templates/info.html.tera +++ b/crates/http/templates/info.html.tera @@ -2,7 +2,7 @@ {% block content %}

{{ name }}

- {{ time }} +

{{ time }}

{{ description | safe }}
From b34d7cdcdd79fcf6997ef6c04f93c43c4fb36d60 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 22:48:45 +0200 Subject: [PATCH 61/65] implement `allowed_tags` config option, format config, update documentation comments --- crates/crawler/config.toml | 84 +++++++++++++++++++++++++++--------- crates/crawler/src/config.rs | 13 +++--- crates/crawler/src/main.rs | 67 ++++++++++++++++------------ 3 files changed, 110 insertions(+), 54 deletions(-) diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml index ad50346..50f5a7e 100644 --- a/crates/crawler/config.toml +++ b/crates/crawler/config.toml @@ -4,29 +4,71 @@ update = 900 # Database connection setup # * see crates/mysql/database [mysql] -host = "localhost" -port = 3306 -username = "" -password = "" -database = "rssto" + + host = "localhost" + port = 3306 + username = "" + password = "" + database = "rssto" # Content sources (unlimited) [[channel]] -url = "https://" -items_limit = 20 -persist_item_title = true -persist_item_description = true -# optional: -# content_title_selector = "h1" -# content_description_selector = "article" -# persist_images_selector = "img" + + # RSS feed source + url = "https://1" + + # Limit latest channel items to crawl (unlimited by default) + items_limit = 20 + + # Save Channel item title in the database (currently not in use) + persist_item_title = true + + #Save Channel item description in the database (currently not in use) + persist_item_description = true + + # Allowed tags + # * empty to strip all tags (default) + allowed_tags = [] + + # Scrape title by CSS selector + # * None to use Channel item title if exists or fail to continue + # content_title_selector = "h1" + + # Scrape description by CSS selector + # * None to use Channel item description if exists or fail to continue + # content_description_selector = "article" + + # Preload content images locally if `Some` + # * currently stored in the database + # persist_images_selector = "img" + [[channel]] -url = "https://" -items_limit = 20 -persist_item_title = true -persist_item_description = true -# optional: -# content_title_selector = "h1" -# content_description_selector = "article" -# persist_images_selector = "img" \ No newline at end of file + + # RSS feed source + url = "https://2" + + # Limit latest channel items to crawl (unlimited by default) + items_limit = 20 + + # Save Channel item title in the database (currently not in use) + persist_item_title = true + + #Save Channel item description in the database (currently not in use) + persist_item_description = true + + # Allowed tags + # * empty to strip all tags (default) + allowed_tags = [] + + # Scrape title by CSS selector + # * None to use Channel item title if exists or fail to continue + # content_title_selector = "h1" + + # Scrape description by CSS selector + # * None to use Channel item description if exists or fail to continue + # content_description_selector = "article" + + # Preload content images locally if `Some` + # * currently stored in the database + # persist_images_selector = "img" diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index b4734cc..cf2881b 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -15,18 +15,21 @@ pub struct Mysql { pub struct Channel { /// RSS feed source pub url: Url, - /// Limit channel items (unlimited by default) + /// Limit latest channel items to crawl (unlimited by default) pub items_limit: Option, - /// Save item title + /// Save Channel item title in the database (currently not in use) pub persist_item_title: bool, - /// Save item description + /// Save Channel item description in the database (currently not in use) pub persist_item_description: bool, /// Scrape title by CSS selector - /// * None to ignore + /// * None to use Channel item title if exists or fail to continue pub content_title_selector: Option, /// Scrape description by CSS selector - /// * None to ignore + /// * None to use Channel item description if exists or fail to continue pub content_description_selector: Option, + /// Allowed tags + /// * empty to strip all tags (default) + pub allowed_tags: std::collections::HashSet, /// Preload content images locally if `Some` /// * currently stored in the database pub persist_images_selector: Option, diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 8d53155..110092b 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -60,11 +60,12 @@ fn main() -> Result<()> { } fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> { - use ammonia::clean; + use std::collections::HashSet; - fn strip_tags(html: &str) -> String { + /// Removes all tags from `html` excluding `allowed_tags` or all if None + fn strip_tags(html: &str, allowed_tags: Option<&HashSet>) -> String { ammonia::Builder::new() - .tags(std::collections::HashSet::new()) + .tags(allowed_tags.map_or(HashSet::new(), |a| a.iter().map(|t| t.as_str()).collect())) .clean(html) .to_string() } @@ -113,12 +114,14 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul guid, link, if channel_config.persist_item_title { - channel_item.title().map(strip_tags) + channel_item.title().map(|s| strip_tags(s, None)) } else { None }, if channel_config.persist_item_description { - channel_item.description().map(clean) + channel_item + .description() + .map(|s| strip_tags(s, Some(&channel_config.allowed_tags))) } else { None }, @@ -126,35 +129,43 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul info!("Register new channel item #{channel_item_id} ({link})"); // preload remote content.. let html = scraper::Html::parse_document(&get(link)?.text()?); - let description = clean(&match channel_config.content_description_selector { - Some(ref selector) => match html.select(selector).next() { - Some(description) => description.inner_html(), - None => bail!("Could not scrape `description` selector from `{link}`"), + let description = strip_tags( + &match channel_config.content_description_selector { + Some(ref selector) => match html.select(selector).next() { + Some(description) => description.inner_html(), + None => bail!("Could not scrape `description` selector from `{link}`"), + }, + None => match channel_item.description { + Some(ref description) => description.clone(), + None => { + bail!("Could not assign `description` from channel item for `{link}`") + } + }, }, - None => match channel_item.description { - Some(ref description) => description.clone(), - None => { - bail!("Could not assign `description` from channel item for `{link}`") - } - }, - }); + Some(&channel_config.allowed_tags), + ); let content_id = tx.insert_content( channel_item_id, None, - strip_tags(&match channel_config.content_title_selector { - Some(ref selector) => match html.select(selector).next() { - Some(title) => title.inner_html(), - None => bail!("Could not scrape `title` selector from `{link}`"), + strip_tags( + &match channel_config.content_title_selector { + Some(ref selector) => match html.select(selector).next() { + Some(title) => title.inner_html(), + None => bail!("Could not scrape `title` selector from `{link}`"), + }, + None => match channel_item.title { + Some(ref title) => title.clone(), + None => { + bail!( + "Could not assign `title` from channel item for content in `{link}`" + ) + } + }, }, - None => match channel_item.title { - Some(ref title) => title.clone(), - None => { - bail!("Could not assign `title` from channel item for content in `{link}`") - } - }, - }) + None, + ) .trim(), - clean(&description).trim(), + description.trim(), )?; info!("Add new content record #{content_id}"); // persist images if enabled From 7e4d9e3ed6973aaf16378cb5a331b392ef4404e3 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 10 Jan 2026 22:50:44 +0200 Subject: [PATCH 62/65] remove extra keyword --- crates/mysql/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/mysql/Cargo.toml b/crates/mysql/Cargo.toml index 253d787..0a151f3 100644 --- a/crates/mysql/Cargo.toml +++ b/crates/mysql/Cargo.toml @@ -5,7 +5,7 @@ edition = "2024" license = "MIT" readme = "README.md" description = "Shared MySQL database library" -keywords = ["rssto", "database", "mysql", "library", "driver", "api"] +keywords = ["rssto", "database", "mysql", "library", "api"] # categories = [] repository = "https://github.com/YGGverse/rssto" From 2b804d8915143239ee23ae38309bc94e2b4f2523 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sun, 11 Jan 2026 20:36:00 +0200 Subject: [PATCH 63/65] normalize db tables, optionally persist channel descriptions, remove entries logic from the crawler, update config options --- crates/crawler/config.toml | 34 +++--- crates/crawler/src/config.rs | 12 ++- crates/crawler/src/main.rs | 135 ++++++++++++----------- crates/http/src/main.rs | 111 ++++++++++++++----- crates/http/templates/index.html.tera | 8 +- crates/llm/src/main.rs | 65 +++++++----- crates/mysql/database/0.1.0.sql | 147 +++++++++++++++++++------- crates/mysql/src/connection.rs | 104 ++++++++++++------ crates/mysql/src/table.rs | 24 +++-- crates/mysql/src/transaction.rs | 109 +++++++++++++------ 10 files changed, 500 insertions(+), 249 deletions(-) diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml index 50f5a7e..7c2cb2b 100644 --- a/crates/crawler/config.toml +++ b/crates/crawler/config.toml @@ -18,25 +18,28 @@ update = 900 url = "https://1" # Limit latest channel items to crawl (unlimited by default) - items_limit = 20 + items_limit = 5 - # Save Channel item title in the database (currently not in use) - persist_item_title = true + # Save Channel `title` and `description` in the database (currently not in use) + persist_description = true - #Save Channel item description in the database (currently not in use) + # Save Channel item `title` and `description` in the database persist_item_description = true # Allowed tags # * empty to strip all tags (default) - allowed_tags = [] + allowed_tags = ["a", "br", "p", "img"] + + # Grab Channel item content (from the item `link`) + scrape_item_content = false # Scrape title by CSS selector # * None to use Channel item title if exists or fail to continue - # content_title_selector = "h1" + # scrape_item_content_title_selector = "h1" # Scrape description by CSS selector # * None to use Channel item description if exists or fail to continue - # content_description_selector = "article" + # scrape_item_content_description_selector = "article" # Preload content images locally if `Some` # * currently stored in the database @@ -49,25 +52,28 @@ update = 900 url = "https://2" # Limit latest channel items to crawl (unlimited by default) - items_limit = 20 + items_limit = 5 - # Save Channel item title in the database (currently not in use) - persist_item_title = true + # Save Channel `title` and `description` in the database (currently not in use) + persist_description = true - #Save Channel item description in the database (currently not in use) + # Save Channel item `title` and `description` in the database persist_item_description = true # Allowed tags # * empty to strip all tags (default) - allowed_tags = [] + allowed_tags = ["a", "br", "p", "img"] + + # Grab Channel item content (from the item `link`) + scrape_item_content = false # Scrape title by CSS selector # * None to use Channel item title if exists or fail to continue - # content_title_selector = "h1" + # scrape_item_content_title_selector = "h1" # Scrape description by CSS selector # * None to use Channel item description if exists or fail to continue - # content_description_selector = "article" + # scrape_item_content_description_selector = "article" # Preload content images locally if `Some` # * currently stored in the database diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs index cf2881b..63fe5a5 100644 --- a/crates/crawler/src/config.rs +++ b/crates/crawler/src/config.rs @@ -17,16 +17,18 @@ pub struct Channel { pub url: Url, /// Limit latest channel items to crawl (unlimited by default) pub items_limit: Option, - /// Save Channel item title in the database (currently not in use) - pub persist_item_title: bool, - /// Save Channel item description in the database (currently not in use) + /// Save Channel title and description in the database + pub persist_description: bool, + /// Save Channel item title and description in the database pub persist_item_description: bool, + /// Grab Channel item content (from the item `link`) + pub scrape_item_content: bool, /// Scrape title by CSS selector /// * None to use Channel item title if exists or fail to continue - pub content_title_selector: Option, + pub scrape_item_content_title_selector: Option, /// Scrape description by CSS selector /// * None to use Channel item description if exists or fail to continue - pub content_description_selector: Option, + pub scrape_item_content_description_selector: Option, /// Allowed tags /// * empty to strip all tags (default) pub allowed_tags: std::collections::HashSet, diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 110092b..62ecd8d 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -72,14 +72,6 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul let channel_url = channel_config.url.to_string(); // allocate once - let channel_items = - match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { - Ok(response) => response.into_items(), - Err(e) => bail!("Could not parse response: `{e}`"), - }; - - let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); - let channel_id = match tx.channel_id_by_url(&channel_url)? { Some(channel_id) => channel_id, None => { @@ -89,6 +81,28 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul } }; + let channel_items = + match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { + Ok(channel) => { + if channel_config.persist_description { + let channel_description_id = tx.insert_channel_description( + channel_id, + None, + Some(strip_tags(channel.title(), None)), + Some(strip_tags( + channel.description(), + Some(&channel_config.allowed_tags), + )), + )?; + debug!("Save channel description #{channel_description_id}") + } + channel.into_items() + } + Err(e) => bail!("Could not parse response: `{e}`"), + }; + + let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); + for channel_item in channel_items.iter().take(channel_items_limit) { let guid = match channel_item.guid { Some(ref guid) => guid.value.as_ref(), @@ -106,72 +120,62 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul None => bail!("Undefined `pub_date`"), }; if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { + debug!("Channel item `{guid}` already exists, skipped."); continue; // skip next steps as processed } - let channel_item_id = tx.insert_channel_item( - channel_id, - pub_date, - guid, - link, - if channel_config.persist_item_title { - channel_item.title().map(|s| strip_tags(s, None)) - } else { - None - }, - if channel_config.persist_item_description { + let channel_item_id = tx.insert_channel_item(channel_id, pub_date, guid, link)?; + info!("Register new channel item #{channel_item_id} ({link})"); + if channel_config.persist_item_description { + let channel_item_description_id = tx.insert_channel_item_description( + channel_item_id, + None, + channel_item.title().map(|s| strip_tags(s, None)), channel_item .description() - .map(|s| strip_tags(s, Some(&channel_config.allowed_tags))) - } else { - None - }, - )?; - info!("Register new channel item #{channel_item_id} ({link})"); + .map(|s| strip_tags(s, Some(&channel_config.allowed_tags))), + )?; + debug!("Save channel item description #{channel_item_description_id}") + } // preload remote content.. + if !channel_config.scrape_item_content { + continue; + } + let channel_item_content_id = tx.insert_channel_item_content(channel_item_id)?; + info!("Add new content record #{channel_item_content_id}"); + let html = scraper::Html::parse_document(&get(link)?.text()?); - let description = strip_tags( - &match channel_config.content_description_selector { - Some(ref selector) => match html.select(selector).next() { - Some(description) => description.inner_html(), - None => bail!("Could not scrape `description` selector from `{link}`"), - }, - None => match channel_item.description { - Some(ref description) => description.clone(), - None => { - bail!("Could not assign `description` from channel item for `{link}`") - } - }, + let description = match channel_config.scrape_item_content_description_selector { + Some(ref selector) => match html.select(selector).next() { + Some(description) => Some(strip_tags( + &description.inner_html(), + Some(&channel_config.allowed_tags), + )), + None => bail!("Could not scrape `description` selector from `{link}`"), }, - Some(&channel_config.allowed_tags), - ); - let content_id = tx.insert_content( - channel_item_id, + None => None, + }; + let channel_item_content_description_id = tx.insert_channel_item_content_description( + channel_item_content_id, None, - strip_tags( - &match channel_config.content_title_selector { - Some(ref selector) => match html.select(selector).next() { - Some(title) => title.inner_html(), - None => bail!("Could not scrape `title` selector from `{link}`"), - }, - None => match channel_item.title { - Some(ref title) => title.clone(), - None => { - bail!( - "Could not assign `title` from channel item for content in `{link}`" - ) - } - }, + match channel_config.scrape_item_content_title_selector { + Some(ref selector) => match html.select(selector).next() { + Some(title) => Some(strip_tags(&title.inner_html(), None)), + None => bail!("Could not scrape `title` selector from `{link}`"), }, - None, - ) - .trim(), - description.trim(), + None => None, + } + .as_ref() + .map(|s| s.trim()), + description.as_ref().map(|s| s.trim()), )?; - info!("Add new content record #{content_id}"); + debug!("Save channel item content description #{channel_item_content_description_id}"); // persist images if enabled if let Some(ref selector) = channel_config.persist_images_selector { use sha2::{Digest, Sha256}; - for element in scraper::Html::parse_document(&description).select(selector) { + if description.is_none() { + bail!("Field `description` is required to scrape images from `{link}`") + } + for element in scraper::Html::parse_document(&description.unwrap()).select(selector) { if let Some(src) = element.value().attr("src") { let absolute = match Url::parse(src) { Ok(url) => url, @@ -197,10 +201,15 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul image_id } }; - let content_image_id = tx.insert_content_image(content_id, image_id)?; + let content_image_id = + tx.insert_content_image(channel_item_content_id, image_id)?; debug!("Add content image relationship #{content_image_id}"); let uri = format!("/image/{image_id}"); - tx.replace_content_description(content_id, src, &uri)?; + tx.replace_channel_item_content_description( + channel_item_content_id, + src, + &uri, + )?; debug!("Replace content image in description from `{src}` to `{uri}`") } } diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs index 978387b..70f8a50 100644 --- a/crates/http/src/main.rs +++ b/crates/http/src/main.rs @@ -31,7 +31,7 @@ fn index( #[derive(Serialize)] #[serde(crate = "rocket::serde")] struct Row { - content_id: u64, + channel_item_content_description_id: u64, link: String, time: String, title: String, @@ -41,7 +41,7 @@ fn index( Status::InternalServerError })?; let total = conn - .contents_total_by_provider_id(global.provider_id, search) + .channel_item_content_descriptions_total_by_provider_id(global.provider_id, search) .map_err(|e| { error!("Could not get contents total: `{e}`"); Status::InternalServerError @@ -73,7 +73,7 @@ fn index( back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), next: if page.unwrap_or(1) * global.list_limit >= total { None } else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, - rows: conn.contents_by_provider_id( + rows: conn.channel_item_content_descriptions_by_provider_id( global.provider_id, search, Sort::Desc, @@ -84,13 +84,16 @@ fn index( Status::InternalServerError })? .into_iter() - .map(|content| { - let channel_item = conn.channel_item(content.channel_item_id).unwrap().unwrap(); + .map(|channel_item_content_description| { + let channel_item = conn.channel_item( + channel_item_content_description.channel_item_content_id + ).unwrap().unwrap(); Row { - content_id: content.content_id, + channel_item_content_description_id: + channel_item_content_description.channel_item_content_description_id, link: channel_item.link, time: time(channel_item.pub_date).format(&global.format_time).to_string(), - title: content.title, + title: channel_item_content_description.title.unwrap_or_default(), // @TODO handle } }) .collect::>(), @@ -102,9 +105,9 @@ fn index( )) } -#[get("/")] +#[get("/")] fn info( - content_id: u64, + channel_item_content_description_id: u64, db: &State, meta: &State, global: &State, @@ -113,29 +116,52 @@ fn info( error!("Could not connect database: `{e}`"); Status::InternalServerError })?; - match conn.content(content_id).map_err(|e| { - error!("Could not get content `{content_id}`: `{e}`"); + match conn.channel_item_content_description(channel_item_content_description_id).map_err(|e| { + error!("Could not get `channel_item_content_description_id` {channel_item_content_description_id}: `{e}`"); Status::InternalServerError })? { - Some(content) => { - let channel_item = conn - .channel_item(content.channel_item_id) + Some(channel_item_content_description) => { + let channel_item_content = conn + .channel_item_content(channel_item_content_description.channel_item_content_id) .map_err(|e| { - error!("Could not get requested channel item: `{e}`"); + error!( + "Could not get requested `channel_item_content` #{}: `{e}`", + channel_item_content_description.channel_item_content_id + ); Status::InternalServerError })? .ok_or_else(|| { - error!("Could not find requested channel item"); + error!( + "Could not find requested `channel_item_content` #{}", + channel_item_content_description.channel_item_content_id + ); Status::NotFound })?; + let channel_item = conn + .channel_item(channel_item_content.channel_item_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item` #{}: `{e}`", + channel_item_content.channel_item_id + ); + Status::InternalServerError + })? + .ok_or_else(|| { + error!( + "Could not find requested `channel_item` #{}", + channel_item_content.channel_item_id + ); + Status::NotFound + })?; + let title = channel_item_content_description.title.unwrap_or_default(); // @TODO handle Ok(Template::render( "info", context! { - description: content.description, + description: channel_item_content_description.description, link: channel_item.link, meta: meta.inner(), - title: format!("{}{S}{}", content.title, meta.title), - name: content.title, + title: format!("{title}{S}{}", meta.title), + name: title, time: time(channel_item.pub_date).format(&global.format_time).to_string(), }, )) @@ -175,8 +201,8 @@ fn rss( error!("Could not connect database: `{e}`"); Status::InternalServerError })?; - for content in conn - .contents_by_provider_id( + for channel_item_content_description in conn + .channel_item_content_descriptions_by_provider_id( global.provider_id, search, Sort::Desc, @@ -184,26 +210,53 @@ fn rss( Some(global.list_limit), ) .map_err(|e| { - error!("Could not load channel item contents: `{e}`"); + error!( + "Could not load `channel_item_content_description` for `provider` #{:?}: `{e}`", + global.provider_id + ); Status::InternalServerError })? { - let channel_item = conn - .channel_item(content.channel_item_id) + let channel_item_content = conn + .channel_item_content(channel_item_content_description.channel_item_content_id) .map_err(|e| { - error!("Could not get requested channel item: `{e}`"); + error!( + "Could not get requested `channel_item_content` #{}: `{e}`", + channel_item_content_description.channel_item_content_id + ); Status::InternalServerError })? .ok_or_else(|| { - error!("Could not find requested channel item"); + error!( + "Could not find requested `channel_item_content` #{}", + channel_item_content_description.channel_item_content_id + ); + Status::NotFound + })?; + let channel_item = conn + .channel_item(channel_item_content.channel_item_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item` #{}: `{e}`", + channel_item_content.channel_item_id + ); + Status::InternalServerError + })? + .ok_or_else(|| { + error!( + "Could not find requested `channel_item` #{}", + channel_item_content.channel_item_id + ); Status::NotFound })?; feed.push( - content.channel_item_id, + channel_item_content_description.channel_item_content_description_id, time(channel_item.pub_date), channel_item.link, - content.title, - content.description, + channel_item_content_description.title.unwrap_or_default(), // @TODO handle + channel_item_content_description + .description + .unwrap_or_default(), // @TODO handle ) } Ok(RawXml(feed.commit())) diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera index 65082f9..4cef190 100644 --- a/crates/http/templates/index.html.tera +++ b/crates/http/templates/index.html.tera @@ -3,13 +3,15 @@ {% if rows %} {% for row in rows %}
- -

{{ row.title }}

+ +

{{ row.title }}

{{ row.time }}

{% endfor %} {% else %} -
Nothing.
+
+

Nothing.

+
{% endif %} {% if next %}Next{% endif %} {% if back %}Back{% endif %} diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs index 188902f..7184303 100644 --- a/crates/llm/src/main.rs +++ b/crates/llm/src/main.rs @@ -67,35 +67,50 @@ async fn main() -> Result<()> { loop { debug!("New queue begin..."); let mut tx = db.transaction()?; - for source in tx.contents_queue_for_provider_id(provider_id)? { + for channel_item_content_description in + tx.channel_item_content_descriptions_queue_for_provider_id(provider_id)? + { debug!( - "Begin generating `content_id` #{} using `provider_id` #{provider_id}.", - source.content_id + "Begin generating `channel_item_content_description` #{} using `provider_id` #{provider_id}.", + channel_item_content_description.channel_item_content_description_id ); - - let title = llm - .chat_completion(ChatCompletionRequest::new(&config.llm.model).message( - Message::user(format!("{}\n{}", config.llm.message, source.title)), - )) - .await?; - - let description = llm - .chat_completion(ChatCompletionRequest::new(&config.llm.model).message( - Message::user(format!("{}\n{}", config.llm.message, source.description)), - )) - .await?; - - let content_id = tx.insert_content( - source.channel_item_id, + let title = match channel_item_content_description.title { + Some(subject) => Some( + llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, subject)), + )) + .await? + .choices[0] + .message + .content + .trim() + .to_string(), + ), + None => None, + }; + let description = match channel_item_content_description.description { + Some(subject) => Some( + llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, subject)), + )) + .await? + .choices[0] + .message + .content + .trim() + .to_string(), + ), + None => None, + }; + let channel_item_content_description_id = tx.insert_channel_item_content_description( + channel_item_content_description.channel_item_content_id, Some(provider_id), - &title.choices[0].message.content, - &description.choices[0].message.content, + title.as_deref(), + description.as_deref(), )?; - - debug!( - "Created `content_id` #{content_id} using `content_id` #{} source by `provider_id` #{provider_id}.", - source.content_id - ) + info!( + "Create `channel_item_content_description` #{channel_item_content_description_id} by `provider_id` #{provider_id}." + ); } tx.commit()?; debug!("Queue completed"); diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index 9524e12..2f1e5f6 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- сб, 10-січ-2026 14:27:50 +0200 +-- нд, 11-січ-2026 20:33:40 +0200 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -21,7 +21,7 @@ USE `rssto` ; -- Table `rssto`.`channel` -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`channel` ( - `channel_id` INT NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, `url` VARCHAR(255) NOT NULL, PRIMARY KEY (`channel_id`), UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE) @@ -32,14 +32,12 @@ ENGINE = InnoDB; -- Table `rssto`.`channel_item` -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` ( - `channel_item_id` INT NOT NULL AUTO_INCREMENT, - `channel_id` INT NOT NULL, + `channel_item_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL, `pub_date` BIGINT NOT NULL, `guid` VARCHAR(255) NOT NULL, `link` VARCHAR(255) NOT NULL, - `title` VARCHAR(255) NULL, - `description` LONGTEXT NULL, - PRIMARY KEY (`channel_item_id`), + PRIMARY KEY (`channel_item_id`, `channel_id`), INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE, UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE, CONSTRAINT `fk_channel_item_channel` @@ -54,7 +52,7 @@ ENGINE = InnoDB; -- Table `rssto`.`provider` -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`provider` ( - `provider_id` INT NOT NULL AUTO_INCREMENT, + `provider_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, `name` VARCHAR(255) NOT NULL, PRIMARY KEY (`provider_id`), UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE) @@ -62,27 +60,17 @@ ENGINE = InnoDB; -- ----------------------------------------------------- --- Table `rssto`.`content` +-- Table `rssto`.`channel_item_content` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `rssto`.`content` ( - `content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `channel_item_id` INT NOT NULL, - `provider_id` INT NULL, - `title` VARCHAR(255) NOT NULL, - `description` LONGTEXT NOT NULL, - PRIMARY KEY (`content_id`), - INDEX `fk_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE, - INDEX `fk_content_provider_idx` (`provider_id` ASC) VISIBLE, - UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE, - CONSTRAINT `fk_content_channel_item` +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content` ( + `channel_item_content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_id` INT UNSIGNED NOT NULL, + PRIMARY KEY (`channel_item_content_id`, `channel_item_id`), + INDEX `fk_channel_item_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_channel_item` FOREIGN KEY (`channel_item_id`) REFERENCES `rssto`.`channel_item` (`channel_item_id`) ON DELETE NO ACTION - ON UPDATE NO ACTION, - CONSTRAINT `fk_content_provider` - FOREIGN KEY (`provider_id`) - REFERENCES `rssto`.`provider` (`provider_id`) - ON DELETE NO ACTION ON UPDATE NO ACTION) ENGINE = InnoDB; @@ -92,31 +80,38 @@ ENGINE = InnoDB; -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`image` ( `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `provider_id` INT UNSIGNED NULL, `sha256` CHAR(64) NOT NULL, `src` VARCHAR(2048) NULL, `url` VARCHAR(2048) NULL, `data` MEDIUMBLOB NOT NULL, PRIMARY KEY (`image_id`), - UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE) + UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE, + INDEX `fk_image_provider_idx` (`provider_id` ASC) VISIBLE, + CONSTRAINT `fk_image_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) ENGINE = InnoDB; -- ----------------------------------------------------- --- Table `rssto`.`content_image` +-- Table `rssto`.`channel_item_content_image` -- ----------------------------------------------------- -CREATE TABLE IF NOT EXISTS `rssto`.`content_image` ( - `content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `content_id` BIGINT UNSIGNED NOT NULL, +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` ( + `channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `content_channel_item_content_id` BIGINT UNSIGNED NOT NULL, `image_id` BIGINT UNSIGNED NOT NULL, - PRIMARY KEY (`content_image_id`), - INDEX `fk_content_image_content_idx` (`content_id` ASC) VISIBLE, - INDEX `fk_content_image_image_idx` (`image_id` ASC) VISIBLE, - CONSTRAINT `fk_content_image_content` - FOREIGN KEY (`content_id`) - REFERENCES `rssto`.`content` (`content_id`) + PRIMARY KEY (`channel_item_content_image_id`), + INDEX `fk_channel_item_content_image_channel_item_content_idx` (`content_channel_item_content_id` ASC) VISIBLE, + INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_image_channel_item_content` + FOREIGN KEY (`content_channel_item_content_id`) + REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`) ON DELETE NO ACTION ON UPDATE NO ACTION, - CONSTRAINT `fk_content_image_image` + CONSTRAINT `fk_channel_item_content_image_image` FOREIGN KEY (`image_id`) REFERENCES `rssto`.`image` (`image_id`) ON DELETE NO ACTION @@ -124,6 +119,84 @@ CREATE TABLE IF NOT EXISTS `rssto`.`content_image` ( ENGINE = InnoDB; +-- ----------------------------------------------------- +-- Table `rssto`.`channel_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_description` ( + `channel_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + PRIMARY KEY (`channel_description_id`), + INDEX `fk_channel_description_provider_idx` (`provider_id` ASC) VISIBLE, + INDEX `fk_channel_description_channel_idx` (`channel_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_description_channel` + FOREIGN KEY (`channel_id`) + REFERENCES `rssto`.`channel` (`channel_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_description` ( + `channel_item_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_id` INT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + INDEX `fk_channel_item_description_channel_item_idx` (`channel_item_id` ASC) VISIBLE, + INDEX `fk_channel_item_description_provider_idx` (`provider_id` ASC) VISIBLE, + PRIMARY KEY (`channel_item_description_id`), + UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_description_channel_item` + FOREIGN KEY (`channel_item_id`) + REFERENCES `rssto`.`channel_item` (`channel_item_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_item_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_content_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_description` ( + `channel_item_content_description_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_content_id` BIGINT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + PRIMARY KEY (`channel_item_content_description_id`), + INDEX `fk_channel_item_content_description_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE, + INDEX `fk_channel_item_content_description_provider_idx` (`provider_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_item_content_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_description_channel_item_content` + FOREIGN KEY (`channel_item_content_id`) + REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_item_content_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + SET SQL_MODE=@OLD_SQL_MODE; SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; diff --git a/crates/mysql/src/connection.rs b/crates/mysql/src/connection.rs index d9617d1..22ce0cf 100644 --- a/crates/mysql/src/connection.rs +++ b/crates/mysql/src/connection.rs @@ -19,54 +19,99 @@ impl Connection { `channel_id`, `pub_date`, `guid`, - `link`, - `title`, - `description` FROM `channel_item` WHERE `channel_item_id` = ?", + `link` FROM `channel_item` WHERE `channel_item_id` = ?", (channel_item_id,), ) } - pub fn content(&mut self, content_id: u64) -> Result, Error> { + pub fn channel_item_content( + &mut self, + channel_item_content_id: u64, + ) -> Result, Error> { self.conn.exec_first( - "SELECT `content_id`, - `channel_item_id`, - `provider_id`, - `title`, - `description` FROM `content` WHERE `content_id` = ?", - (content_id,), + "SELECT `channel_item_content_id`, + `channel_item_id` + FROM `channel_item_content` WHERE `channel_item_content_id` = ?", + (channel_item_content_id,), ) } - pub fn contents_total_by_provider_id( + pub fn channel_item_content_description( + &mut self, + channel_item_content_description_id: u64, + ) -> Result, Error> { + self.conn.exec_first( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` FROM `channel_item_content_description` + WHERE `channel_item_content_description_id` = ?", + (channel_item_content_description_id,), + ) + } + + pub fn channel_item_content_descriptions_total_by_provider_id( &mut self, provider_id: Option, keyword: Option<&str>, ) -> Result { - let total: Option = self.conn.exec_first( - "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?", - (provider_id, like(keyword)), - )?; + let total: Option = match keyword { + Some(k) => self.conn.exec_first( + "SELECT COUNT(*) FROM `channel_item_content_description` + WHERE `provider_id` <=> ? AND `title` LIKE '%?%'", + (provider_id, k), + )?, + None => self.conn.exec_first( + "SELECT COUNT(*) FROM `channel_item_content_description` + WHERE `provider_id` <=> ?", + (provider_id,), + )?, + }; + Ok(total.unwrap_or(0)) } - pub fn contents_by_provider_id( + pub fn channel_item_content_descriptions_by_provider_id( &mut self, provider_id: Option, keyword: Option<&str>, sort: Sort, start: Option, limit: Option, - ) -> Result, Error> { - self.conn.exec(format!( - "SELECT `content_id`, - `channel_item_id`, - `provider_id`, - `title`, - `description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {},{}", - start.unwrap_or(0), - limit.unwrap_or(DEFAULT_LIMIT) - ), - (provider_id, like(keyword), )) + ) -> Result, Error> { + match keyword { + Some(k) => self.conn.exec( + format!( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` + FROM `channel_item_content_description` + WHERE `provider_id` <=> ? AND `title` LIKE '%?%' + ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}", + start.unwrap_or(0), + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id, k), + ), + None => self.conn.exec( + format!( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` + FROM `channel_item_content_description` + WHERE `provider_id` <=> ? + ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}", + start.unwrap_or(0), + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id,), + ), + } } pub fn content_image(&mut self, content_image_id: u64) -> Result, Error> { @@ -107,9 +152,4 @@ impl Connection { } } -/// Shared search logic -fn like(value: Option<&str>) -> String { - value.map_or("%".into(), |k| format!("{k}%")) -} - const DEFAULT_LIMIT: usize = 100; diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs index 631bc37..867abb8 100644 --- a/crates/mysql/src/table.rs +++ b/crates/mysql/src/table.rs @@ -13,19 +13,30 @@ pub struct ChannelItem { pub pub_date: i64, pub guid: String, pub link: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItemDescription { + pub channel_item_description_id: u64, + pub channel_item_id: u64, + pub provider_id: Option, pub title: Option, pub description: Option, } #[derive(Debug, PartialEq, Eq, FromRow)] -pub struct Content { - pub content_id: u64, +pub struct ChannelItemContent { + pub channel_item_content_id: u64, pub channel_item_id: u64, - /// None if the original `title` and `description` values - /// parsed from the channel item on crawl +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItemContentDescription { + pub channel_item_content_description_id: u64, + pub channel_item_content_id: u64, pub provider_id: Option, - pub title: String, - pub description: String, + pub title: Option, + pub description: Option, } #[derive(Debug, PartialEq, Eq, FromRow)] @@ -37,6 +48,7 @@ pub struct Provider { #[derive(Debug, PartialEq, Eq, FromRow)] pub struct Image { pub image_id: u64, + pub provider_id: Option, /// Keep image unique by comparing its data hash pub sha256: String, /// Original `src` tag value to post-replacing diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs index c2a2077..970aaef 100644 --- a/crates/mysql/src/transaction.rs +++ b/crates/mysql/src/transaction.rs @@ -36,6 +36,23 @@ impl Transaction { Ok(self.tx.last_insert_id().unwrap()) } + pub fn insert_channel_description( + &mut self, + channel_id: u64, + provider_id: Option, + title: Option, + description: Option, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_description` SET `channel_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + pub fn channel_items_total_by_channel_id_guid( &mut self, channel_id: u64, @@ -56,66 +73,88 @@ impl Transaction { pub_date: i64, guid: &str, link: &str, - title: Option, - description: Option, ) -> Result { self.tx.exec_drop( "INSERT INTO `channel_item` SET `channel_id` = ?, `pub_date` = ?, `guid` = ?, - `link` = ?, - `title` = ?, - `description` = ?", - (channel_id, pub_date, guid, link, title, description), + `link` = ?", + (channel_id, pub_date, guid, link), )?; Ok(self.tx.last_insert_id().unwrap()) } - pub fn contents_queue_for_provider_id( - &mut self, - provider_id: u64, - ) -> Result, Error> { - self.tx.exec( - "SELECT `c1`.`content_id`, - `c1`.`channel_item_id`, - `c1`.`provider_id`, - `c1`.`title`, - `c1`.`description` - FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS ( - SELECT NULL FROM `content` AS `c2` - WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id` - AND `c2`.`provider_id` = ? LIMIT 1 - )", - (provider_id,), - ) - } - - pub fn insert_content( + pub fn insert_channel_item_description( &mut self, channel_item_id: u64, provider_id: Option, - title: &str, - description: &str, + title: Option, + description: Option, ) -> Result { self.tx.exec_drop( - "INSERT INTO `content` SET `channel_item_id` = ?, - `provider_id` = ?, - `title` = ?, - `description` = ?", + "INSERT INTO `channel_item_description` SET `channel_item_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", (channel_item_id, provider_id, title, description), )?; Ok(self.tx.last_insert_id().unwrap()) } - pub fn replace_content_description( + pub fn channel_item_content_descriptions_queue_for_provider_id( + &mut self, + provider_id: u64, + ) -> Result, Error> { + self.tx.exec( + "SELECT `t1`.`content_id`, + `t1`.`channel_item_id`, + `t1`.`provider_id`, + `t1`.`title`, + `t1`.`description` + FROM `channel_item_content_description` AS `t1` + WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS ( + SELECT NULL FROM `channel_item_content_description` AS `t2` + WHERE `t2`.`channel_item_id` = `t1`.`channel_item_id` + AND `t2`.`provider_id` = ? LIMIT 1 + )", + (provider_id,), + ) + } + + pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_content` SET `channel_item_id` = ?", + (channel_item_id,), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn insert_channel_item_content_description( + &mut self, + channel_item_content_id: u64, + provider_id: Option, + title: Option<&str>, + description: Option<&str>, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_content_description` SET `channel_item_content_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_item_content_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn replace_channel_item_content_description( &mut self, content_id: u64, from: &str, to: &str, ) -> Result<(), Error> { self.tx.exec_drop( - "UPDATE `content` SET `description` = REPLACE(`description`, ?, ?) - WHERE`content_id` = ?", + "UPDATE `channel_item_content_description` + SET `description` = REPLACE(`description`, ?, ?) WHERE`content_id` = ?", (from, to, content_id), ) } From 1bee7daf773f6595f0d4a1b2e16effb54c3a596e Mon Sep 17 00:00:00 2001 From: yggverse Date: Sun, 11 Jan 2026 21:02:35 +0200 Subject: [PATCH 64/65] apply missed db updates --- crates/crawler/src/main.rs | 8 ++++---- crates/mysql/database/0.1.0.sql | 8 ++++---- crates/mysql/src/connection.rs | 14 +------------- crates/mysql/src/table.rs | 11 ----------- crates/mysql/src/transaction.rs | 25 +++++++++++++++---------- 5 files changed, 24 insertions(+), 42 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 62ecd8d..cb6b8d2 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -201,12 +201,12 @@ fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Resul image_id } }; - let content_image_id = - tx.insert_content_image(channel_item_content_id, image_id)?; - debug!("Add content image relationship #{content_image_id}"); + let channel_item_content_image_id = + tx.insert_channel_item_content_image(channel_item_content_id, image_id)?; + debug!("Add content image relationship #{channel_item_content_image_id}"); let uri = format!("/image/{image_id}"); tx.replace_channel_item_content_description( - channel_item_content_id, + channel_item_content_description_id, src, &uri, )?; diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/database/0.1.0.sql index 2f1e5f6..443595f 100644 --- a/crates/mysql/database/0.1.0.sql +++ b/crates/mysql/database/0.1.0.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- нд, 11-січ-2026 20:33:40 +0200 +-- нд, 11-січ-2026 21:01:10 +0200 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -101,13 +101,13 @@ ENGINE = InnoDB; -- ----------------------------------------------------- CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` ( `channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `content_channel_item_content_id` BIGINT UNSIGNED NOT NULL, + `channel_item_content_id` BIGINT UNSIGNED NOT NULL, `image_id` BIGINT UNSIGNED NOT NULL, PRIMARY KEY (`channel_item_content_image_id`), - INDEX `fk_channel_item_content_image_channel_item_content_idx` (`content_channel_item_content_id` ASC) VISIBLE, + INDEX `fk_channel_item_content_image_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE, INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE, CONSTRAINT `fk_channel_item_content_image_channel_item_content` - FOREIGN KEY (`content_channel_item_content_id`) + FOREIGN KEY (`channel_item_content_id`) REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`) ON DELETE NO ACTION ON UPDATE NO ACTION, diff --git a/crates/mysql/src/connection.rs b/crates/mysql/src/connection.rs index 22ce0cf..35c3469 100644 --- a/crates/mysql/src/connection.rs +++ b/crates/mysql/src/connection.rs @@ -114,22 +114,10 @@ impl Connection { } } - pub fn content_image(&mut self, content_image_id: u64) -> Result, Error> { - self.conn.exec_first( - "SELECT `content_image_id`, - `content_id`, - `image_id`, - `data`, - `source` FROM `content_image` - JOIN `image` ON (`image`.`image_id` = `content_image`.`image_id`) - WHERE `content_image_id` = ? LIMIT 1", - (content_image_id,), - ) - } - pub fn image(&mut self, image_id: u64) -> Result, Error> { self.conn.exec_first( "SELECT `image_id`, + `provider_id`, `sha256`, `src`, `url`, diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs index 867abb8..2c9218d 100644 --- a/crates/mysql/src/table.rs +++ b/crates/mysql/src/table.rs @@ -59,17 +59,6 @@ pub struct Image { pub data: Vec, } -/// Includes joined `image` table members -#[derive(Debug, PartialEq, Eq, FromRow)] -pub struct ContentImage { - pub content_image_id: u64, - pub content_id: u64, - pub image_id: u64, - // Image members (JOIN) - pub data: Vec, - pub source: String, -} - pub enum Sort { Asc, Desc, diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs index 970aaef..a087ece 100644 --- a/crates/mysql/src/transaction.rs +++ b/crates/mysql/src/transaction.rs @@ -106,20 +106,20 @@ impl Transaction { provider_id: u64, ) -> Result, Error> { self.tx.exec( - "SELECT `t1`.`content_id`, - `t1`.`channel_item_id`, + "SELECT `t1`.`channel_item_content_description_id`, + `t1`.`channel_item_content_id`, `t1`.`provider_id`, `t1`.`title`, `t1`.`description` FROM `channel_item_content_description` AS `t1` WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS ( SELECT NULL FROM `channel_item_content_description` AS `t2` - WHERE `t2`.`channel_item_id` = `t1`.`channel_item_id` + WHERE `t2`.`channel_item_content_description_id` = `t1`.`channel_item_content_description_id` AND `t2`.`provider_id` = ? LIMIT 1 )", (provider_id,), ) - } + } // @TODO upgrade to the latest version pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result { self.tx.exec_drop( @@ -148,21 +148,26 @@ impl Transaction { pub fn replace_channel_item_content_description( &mut self, - content_id: u64, + channel_item_content_description_id: u64, from: &str, to: &str, ) -> Result<(), Error> { self.tx.exec_drop( "UPDATE `channel_item_content_description` - SET `description` = REPLACE(`description`, ?, ?) WHERE`content_id` = ?", - (from, to, content_id), + SET `description` = REPLACE(`description`, ?, ?) + WHERE `channel_item_content_description_id` = ?", + (from, to, channel_item_content_description_id), ) } - pub fn insert_content_image(&mut self, content_id: u64, image_id: u64) -> Result { + pub fn insert_channel_item_content_image( + &mut self, + channel_item_content_id: u64, + image_id: u64, + ) -> Result { self.tx.exec_drop( - "INSERT INTO `content_image` SET `content_id` = ?, `image_id` = ?", - (content_id, image_id), + "INSERT INTO `channel_item_content_image` SET `channel_item_content_id` = ?, `image_id` = ?", + (channel_item_content_id, image_id), )?; Ok(self.tx.last_insert_id().unwrap()) } From 2bb7a2da6928b7fe31b7e17bc540e61b91b2b68a Mon Sep 17 00:00:00 2001 From: yggverse Date: Thu, 22 Jan 2026 15:52:17 +0200 Subject: [PATCH 65/65] add `database.mwb` model, rename `database` dir to `version` --- crates/mysql/README.md | 3 +++ crates/mysql/database.mwb | Bin 0 -> 13932 bytes crates/mysql/{database => version}/0.1.0.sql | 0 3 files changed, 3 insertions(+) create mode 100644 crates/mysql/database.mwb rename crates/mysql/{database => version}/0.1.0.sql (100%) diff --git a/crates/mysql/README.md b/crates/mysql/README.md index 681e048..62e2c8e 100644 --- a/crates/mysql/README.md +++ b/crates/mysql/README.md @@ -1,3 +1,6 @@ # rssto-mysql Shared MySQL database library + +> [!TIP] +> See `database.mwb` model or `version` directory to deploy \ No newline at end of file diff --git a/crates/mysql/database.mwb b/crates/mysql/database.mwb new file mode 100644 index 0000000000000000000000000000000000000000..c6dbc34b169592dda4970213d118ee75e4e091bb GIT binary patch literal 13932 zcmZ{L18`=+vv0iN#(_-9Ht1NGJ@jf0w-nmp->!It0uo|L=kNSLbME$I8jc&JL#XpMDTvuwdbz z{ZxE*nLC=f11ucem;j!pOkMzcpn;AnK5ueBfl<-yFU7D!h*J+Y zyA?pSFX4VQrLV`4Y%qih+Nc1BfW2M7nY$9(UfiL$1_^qTcwYeSLR+=E(>bz{S$W~Jml586*c%?9#nE6{INfGK0blefr4*N zfuQ4;ln?y4UJI8oM`fYdu=9eS@Cef(3%9+wxvw{QAgdem_f?vgjDv-Rl;3Uw`wQ6O zTtUo%3}v}J2HgjDrPKj}ac>kLLA35fo!uT1iU&jUZzw&xa26DWVblTlB;R-B-T`d* zuRf=5vC+zn0*Vt4XDx-oohQ!A!g_aBbFs0>QFh$q$TXl=9GJ_T?hlX~sSlI)c4K6# z!H(kS$}oHCcL8M@0d_}+Uhd$Ln>i==!5h;agb7Daw?ja}R_vPb96aj>2d}@*T3Dta zhv?n+j5OriHFG|^?r`YF{c8%&%l<-~!X{+)I3X zK#uQV&Ts+NrLf?~p~ce|WP&RlLV?Wq%_e4S8s;G-@L^`Vk13e(aTP|mB;Nyn_hA6J*t!E8TZ~w* z*(SU6_^{!LL(uKd+ux%HL!)T{wQod>ru?uu^>iW7FyvT7+wFi ztJbmF6+bfKvkRr|)APQy|4!G?@nWIYM7UIB9d+#6%a31Mh`Z89pmnod zylff>TREA7;WaHPVh-2_`F3-^O^%x-hKpo1=>$GM>^+pzTWgGT{T|int?N$BSJ2hu zwXs8*c{#Uibl(76+&rx$XkETPdR=>H?Z-wGaX;~7Px~rQQruzFJ%|6fErj=S{E46- zsj$0AszJ2lUKpn6wuHY4I#equer;r-QJQ(Ou2&f`OG2WGDaf#UMbvHaXzMYjUPDAyt!Hk*CTR}9 zZ*1ILkzCGbDI!J^4-*dRy|mXqU+pAy;=0%{D+ozE3eG>3efnJwId&^XYQPw)M9vzM z_aP;1ahYCTL*I5~&aA zbQL8EXW%)(Vsp(0TbYu&U1ZA4^IVeyZe`6W6jhm&K-`RHJF&(Aj$fqi!Kpi{otLg& zGGnhgA!$rVE1oTFCCGOs{0!%Q0s~qa!R325ne;tNfb`rCVG~95far*=n(w7{2qp}! zyiBnX!W28LN{4HdnkY4g4CTp-$Z-! zd%74rn+VK)?h#7xMcH(6WUs#!v|Nf`tADu6x~exQzc=j{nJ zuugGzzLl`TrKqDDUOKEvO7bJ4NMrSDwTg3p=RJR2wK_Q}HxwsN zOVX8lyL^ro>|Z2CK9*GBin`;r(Fn8)oDWT(A|(AYPkR9%rp&!1$A-Ie4{Ldz#aH|D znF-DAG;c@k_q%5Qgx<>gR!4$@@K^*+U#At7e(l}>flRLm*H#oRX+d$^6u7WUtsHSW zDd4Sqco%OxRt90Z!vJt9GREur$b+kSju4rR3JZdv+*siG+BfUO;_(%V$$)zJqBqqE zdJ<6T$231mwP3Kk*^2*HYzULyAzE0xeN@8Cpbj?60_NFVEm9L|4X~~u%XAN{5{PkE zi}1jPS=`uOl^PnpDj%cZ3FIb--+s4bs@Qx<2XsIXsNyAn5Wekoz~&G?I5-fgZkOLG z(}^_rc(rFwvz53bCu|Q+}FJ*2Im3Pfp4 zUPM-FHxM-k!Zt>*<{Roo)|lkBP|IS@)k*Z$crJi}b>~>D%u?7O2UaEXK?UrZ+FPMHH zaHRxj9&^n;A7$8lYP(QvT`q^`1AMv5rahP$-P1p$3y>bYYha6SPazO_&?%{ZI8Zm7 z{?7aHcya*BVZIddcLP&bUTxVuHa*y9xNRREbA%(e6bEZh;)?m59p-CxkD8&A%HmSJ zF^`G|&`EXPbzm1#++W#Bzn4(4OnF$nK#r)k2JlZ^z+*Qbd8~JJqy=^v#`LTcvea1z zXTi3z#VrR5-|O~4(s&KrrqdoRva*q8(W}Z(R24}|B2b^03ngm*(yx)t+QItxEWlv(EkPVaZc!*J0 zeR+16H?Q*gp6zZfXR|s$uEshoaFun>^KUs9wJ2I&B1`3<0K^b-uC_+ous?p}QM$$; zW6pa85H+P`q6KT3^4CZ#V0M3*U`@7xNoy(dknjVfliOL$<(cNK&{qV?|adpB$EnjdnOKm4e;m^JrZv}@g69zqY zgne=jD}?niU45O9+MZzlX##?;vMv+;_1OxR@%%Qs>*W4rKr2Aqb1V2}9x?`bl?{Tp z3Ig0o1j-rh>{ZI;OAEH6pv2GoclTPXg;B$Nf4%=VN2YYZM*_ zUDtnomB$*;eaw+eA|4#*(ua#W8YR*xapP!i$UVXJ{Hfp`Q1$e}YY$6V55~oQGLWM% zythezO5#+g;K8`dWLlbRr{m9>j8lN@hR6>1g+hTu4P}Ql7+2~up!Bj3yhzNHNtgbF zA49-v>WErp^$>?GnlxohdG>m;2|`%b<6JbQmCAqg365q}+qq8;!lxILtEe|wag8Xa zp1|se73eILQxDEn6mr}#$5k}WB>nkY27Xq|VSE@7dowA*6_I9#16dF>v>2!sjm6?7 z!m=5xGF29?!|>>(N&i(L{UcKzVAm`~<@<1)q)P@El|1@lidS@nEtHE4;vB45BUF7L zm1qvO6ropnzI+)-=?@aBTA9+{{cNClk^zbAc&L|%Uy=R400{R;{t?j}nKHL1(zzf9 zyhQn?=~+IbM5P;BL}+}&@k$)R$mxdbN?2{pE1)QVGH+fk(`-Ry6%%g2;mp`m+M0iW z>8Nfwx?{SbH4Y~RPLK)RM{Y%;@iL6aPR~Z;Ike{h!QYqmUOU;fkhXq~-Y%eqy20+& z@!^{zemULujxL{#%^%~f-^)&m$M1c5(RXBbV_&rCSAA8xblV2de3MgPgt5tqy?1k( zaGq+*&f>tx%xpYn zn7{Yjt=`7u5`wz2Jbvzp+b$>3I`8EP-6d7xSg9U)S;w=lhgqZRZiLNGEQ8m}Y z?fCooUMH+KO=}QT@zck8Zkj8YA}fw~WX39`vmaHP{pxJ;1MQX5fLM%rHl9vBh@>X9 zM*ZCX5l}5yvP5RxHpmYo;`#0IT@*otDvk!MqxmnZ=avTATG7oaI_ecm6zQ0lgdb}o zTeL@)uU^JzzRPxUNMiDVKGiyVi1+!F!Vk07xzC+m@lxPEvwpywb_ZQMO(jWvm3cVX$wMW)->yjmmN}>R+s8X zyp2_#&3<1!#nXc((%p!xWwTcaF6%uth~#>;-OKEDER;EecvCOBiG3*&L@CUpUI3UX2#T zcG}Gf?mI_?1QXabe*mhKD2qtv3G3YGv=O|@NIhvF2F%{og`Nfb!3^NBodFAUZl*aZ9t*-dJ4+|jPCJ1!*PrSKew#6mp+C+4!_ z*M#mMv5P~v^(bIHDrkjCQ@+m@j;Wfmlx@%!ZtG{dv1e`U5?Z}3#D5w#z{8AN!Y(xB zWt3%J#}{hT6RB63S;`Vdv zO{Jig#jX9f-7nB5yBzekITf<|d|)#dw(5Og@vFx>>648qw>c8!`@wiQK(gsg`YdAA zRgL5z*&Grc@qxN~QI^qU3D|yM&~8ugC2KzuGOY5x1lhRvXU%4DfV`>G2KyX%||LPhP${@an@{64I<)qN+sN>o7w8TIfY^RgX}rP=eSr) zmYb{$=Tn{7*Ce*=wc@#SNm;xhYZxs~)Y58+IAad-Ei;blc8t|Kdg}firk#^<|*TsZ@>xqwNHHq=-H2P**Wy z9?9iJ22Tm`l#PU*h%kJS5AL62yU5d}5o(mV(r`Zrmg!s-(Uwu%C#ch7bd_i;07pU4 z=Pn`yet<6WJa^#=8>HiMP5?2j#I{vRtZ}WuJwX*q3n)_5j%80M3y5V}4+DLwAuSuu zPs*)ojb5y{3LQ^7Rh?}{UGyDJ9I}xR=70$6f#_T@jXukAEeFoV46f8P_f}o5nOo1= zaeW78#olqVY7T@6x8Yi$)VI;(p3ku18B$760YJb0Cp-aqqlwughV(e9DN)0V1sT|8oL566;#Fq(I8v?$(Kh(Xh_`A7(W3d8U-C0y zoSQHoQuhtL(GOCy6_tP5Jc&X3?$1uzv)wl?&ZKmrU}vrAlmCoDb=He)=a(BN*(2Lq z9*#Io1_Na#avp+D4Q`76GjT0h^DRD2T%Gx}rkcpHbtH3xNrE9XM+4=rQhAoaN3+D6fo{M7}4Wy7GS(g(p-$wtKb zHW9Kuwr`HtoG9>hBn$E+91HZLX#0Oc*+gtb02shp{}f5H|A?f*e?*c}?2lp_>!=b{E;Xud46q&x zpmjsUAe%;?PNkEabxa9hNYIOKB7xIQW^ASaItJObQUIA%=aMDl8>`a~$Sa3Y@spwJ zeXM0#*AvwZ#$ITgsccfC(8yeSF+C_`*R@r(ymS&HX#bhD(x+u>`G?i+h4t)g)H5)k=f_*F#X{_%&iCeL21(Nht4fdDG7KzwtgAtY z049XTGTC_O*G_}J8K!(}ghvx_VTRpE=+{$X3$AL{A1FU z2FkKwpK=@z9R0%y#uTsmZ=viPj)Z}E?0oljb7Ch!m;idY`|u(}$lnk^-2P;LMf0-B z?!J;Sxiu!uanpGoEEKVk2`MZTy3?17YKMYgr$IeF%!&Qk*x3LD$N_Of?0YiuX1j9> z#D+Ha)g0)ko~WLQ!h@{FgPpFw){Rz9@Ew zNEBt_k(Y5_OTxclIqjO>vFJ^Gr;t}Yj`ZuMQF2tDMO@KjOz4jbJaM{*k9a#o%r`7; zGgv91Qnn`CC-PcW^B5^TEDId2AF;%rM^{PghLKqzJ48W*cip#ewN8p#ZeVr2kvAjZ zaAQMZSHgNn{S0(1p&KRy#AfJ)+|tuNv_U!wDMWPP2Fh$2l-)6r9&po2r9A_p)6&sd z;{bX352=l4l5m=uOw(WVesddAe^b`!0E#<7f-z(*Vd31Klv2w89$zEKn?X`x{S51N z`h@*arLnxQr-qa;k&FoGA7S~LBIqt>zx7t3U5ViSXPg5vLins6@4SXYWynPSB?u^iQK zmD)M3vVR8wG{AVzT519Fj=z`xld@fPvbJuTCh%{>+?W_~+=vHG99^|LGM5R8PuK*F ztukC61LEiI4--AMMqW<_xTpVZil=X%K}Rgs}^~BIWjpX;6qnEe(uJ@Kx|pK&c?PHe!9& zX1Z9|7+{UdL7vowK_;v_3%NT>=d#>}Ga3uQrPm#N%Fxs+yuvh=AlsG?3aFUe9JWtT zS-ych%7z6y3J5jJ8iF1DWPg20|wkxnOOugG3)?+sGj>!qF^6NSRY16 z!OUg?QDk`-BJtmSf*~a7h;I69DkyWE$-#%i9A&B-;sHNSCI54F%GG2h3oCWkVpfHN zb28|OLOC@7k>PlwJqJ&X5!-;sK9z4PP0Eov^SFQ1m14~BT+pec0W)qpEtCBMaXa;% z+s%=G8U{mMtt<*BR_cSQY9b2OGSWC5IOK(BmNS$Ej<+zpKz!jQib{k4Ed#ArEG0Cz z=>EGL0$%TwLA0o7C{`*?sjQd)jdiU*NCjJvyhzB7RFFKD9aLIioxpxGvos@Ik$k^u z{mz?a5dZ2rCLEc%l~-{N^IAGOEpmKrzo0nofT`@!WG}XZyskY`hGT^VwZ0$AWdb?g zu5Hx3WH2)P{0kl+)(pq5g{K07KOUQaMk{dPAm1>Nx&TC9_TQ^_)Z3&3aaGpDHkt^m z#+pBi@Yaeqf0wCG+QP7&Z|zGI4vapm%wWMU>RBW7ZCYkqvJtb!^Qxtll@ZHsp6mVX zJ*hoee}wnUZFl8J_vt#2M~fJEkj<3aB=UkfJNOh(9HRbu3X^?h&7zWXK>-~TSuqCL z)t2P20ptb>lQwy@b;&Pg|LUSB;TkCZ+zb3blt=UErZD+or{R=D&`d!Jkdhrh;wIJ3 zb|Q8Ua+Ak!czHgV57#E%z&Pp#-fu$9RKjfWz5u+0c3R~-!X9MtEv;F4l0^ zfZ~LPQF5YyGkNq5WqALw*dj2mgv%cvl-z*-4I1NXw7AYtpZi`(XPsX%FoU{nS(buxd!8APn z&F@1+HDg5#(*Bb^;#W(&s@yIdS@2frdZ%hu#`kqy{X=;JxaPs^CKN7IwhLagM2!Mj zXFQ*=DL(=p-_)oVQWEFBWlH8>>~4Y%1pS*#Z5J+gV_fjULj3WS5{}arsTR70MkX}$ zFDOxD1AW>+n3-UhqJrO+PQz;Ydn9HFVm{X2UU}@ocY!O+sNOy(@^^3Ed7%p}sD;E3FcTN14g|-M}f>j z-OF7Pq~G$Zuo)_IBWo#HjCunv`3kMA%&ToAs>km)J@!rhSk9BxFJ^aL}>UANQq#3lWs824m$%xu&Eg#I%!a3t%5El&UUtqtoQU89^zfKurHan z{cJE7f$N0^VVyjuXp3pw8dfZ5rcf>@x2L1sJR7%vn4)b51$Z=DVQ+cj;_+_Lz6l=z zyf3y!t7GItkALa^df88ineYfc{~9+dnW#8$5o}*KNh}ssOKf`UH#0iUQG;Z<@Iw*> z{|gbnZIq&r8I_quJdC~UJ!@IbxOCgF$j z*`GrUxjM*trG2ueLw(TS%eP%dY}g-YKgB$G7@ntp56EI~x-at{iPDw{N(BA!ulvCY zKnH&y0k9dr{o;47zw>Dd=J&p!5G{}6H}MV}(*HDfRy-->Vl9pNy3nY2oub-pbx~bT zWEN+OYA)^18!a4$JEGVPft70B;dt2)pd@T!4@_ue^ikqm6OS#HaFg54M{_&jN5*L?8XF%D~D9_F*k8P=f@WkUH^ZgSnR zZUkUlP8D9Z<2G^GuflRSR;KY*dV8H))iTJxEROg;zT~s_Lyp<}?I0LaExeu6=U6yQ zv=p?`1i|WSOlou;IB_|-|4dTbA0qluWas|%6+UB5St+{g8GfQS; z6E=|f?_a4w>B|!49+8a~SZIhoW%BgFD}U1F0ou2}8ow=fC*EN1W&U(~iF&*=X%VoU z8@wuKIP z_9qjw%Y1Xb-xa=wfIX`&2V%1WQ9;iy+tQ$~)kD7$N@#P?)5Y1Y6J&+(6x)95%+Zj| zxOT0E_fHwamp)Bc3fQe=e_vY#GZ7tzV5r<-Bl+7a*wxija2I5Id;VY1aPL8ukLC-* zjoRa?gLI#SQC{TsfEqG&lVgr6NKVDkmy&1hZAOTSzcIS&`lcn zx<`QJk1;-3t$U|eciehsJc|zTsC%f`fcKpUjDLv>p;GRvHsO9H(CH`qoE|Pcho5t-OTb!b@zSqLQ?Q*-=PG1W;5jPA)vENHQ0L5kn26D7GEP{~+3s9_Xg zrH~?M4DSpk;l9o@i3g-mocHQsDC+D`Ppr^#W!@&;HCV88f-d+@KVG3jP($WO`(Rr| zMP5@i0gzB7Z=rMG-R2wpQRTUksK~Kc;2}wnET(Xv-Q^llhdCPm3|KB2@rSFoR_b8ugkMhxw^k+!`Vcx3(b zB}yYo<4YgiDrRDn^h1J%mu1yZIb`3Uhkc01_e2qmc2iU*x5Xut{k(&KI9U`RVS?CYEQ}4_EJ?Af+!aw}2a{k}MfM-s33g={EDosrqYQa3Yq7*C zxYEYlLjf!=YrZ3KwS2YTdn^Mn<=HCHUnHK%QsTtQZUVzd!-^pLQ0&3-(eOwSmvW<< zKN2CsNFP{AWwl%PFg8YZekEN*ZOjwaPWc-KI7ztPmHp6a!Y~r4l2h6`(j~-&qc~ox zyfKHQJ+f!X)qm|a6aS_ngSu7w_5)t4)S5nO8%2&qCdZ=>1#z|gf%vJFyW_0s#a0X- zy39yvqESKg$u;R5$I@Ae@PbQwgPU8Vw?fcE`8?tswz)sB?6+fYuYWDSo2uuc*6bfF zst>G1J~%(U;rIM4;d-U}iGrC00Tf*1Ecu&~Uz0^W9E*fypRsIq-3>^t+=h3JOJNId z9DlI<_>00%e~+uMkEss=yExfKmJRBidP-A=0IZTU3akti1txHoxXBW(tfr3Kr z(B?YCdEsxuk(b#m@Gp#cUQ7#~N1gm-!1*m%+nTPogDAI$_|?*$Q$Si5_2LHSU!bY! zu2G1OZ|JttePP>MA|+|u58s@eWRDYIX7(unZGq^I{JZ5i0(_rVP5mZ z&AoV?fi8cnVoU2~5;=EIot@et9mn{OaXXbm7FL0hWZjfp&pEJ!bxgm7gC?f)0`TZ|N}7v*+`klvxM_8zwmT99h(TUz1Fy3=JLK{? zZEx^6coh^9y>Csa2{@m8aEex#M&%|3VEuQrlU_t!&f_t~!+-I!+oZ&x{&bw4R}o~V zeE_hh2z-*WVWr|nR zx2!aj08J9rcs4%b8S7uSi1+8t9avqshi6h2GMWA+9~uMDka$en7HzE~vy;929AZ?57i<972FpPcUF zR?n}W1a(f9M6Biwkr)H9xYoI$s4~4=}Jgs_?XJ%qj#xg7YLlLm4 z=%m(45b~{(s(=w?IQm)T^}{^-m7?U2e)x$(3ZoR}gm`(f9eag<@nSdr{+u-*wqA5` z4vIXqe@n3Kp#F2=%maiPbDE*jfcOuK9QkAe+qDH;yMc$2 zs0zKULy)zfn!lrj+>e@&4l94wJDY3j+ZKu!1gxzL%+9-pmnQ!H@|(6(JL`qY?fXsT zE$po`p{s&9Yp|JI0H3tb%jpxgUWMOJOyT%)`=5oxSI1?Eyg8tX4Rb*yK1}cY(W8|2 zQ7<3P@GBhfT5Y;B(%1HKW(~1#Fx;UhYh`0FDU_3G!Ud2zS8KZgH<#}^Ps6Xr6~4Bv zZ44x^cy1aHJ1(Rd#LN7k^St@nZl@GD_738knf-6uXTrQTKrY&Ja#s+IX_B&nYE%Zv za2&M`%8g~nQI~SJ4#9DNZl4A0>$z`{E%Y5mksH^;}V0q*Yg;7I(Lc^rh3a;-k9vIvzi50O0$9C?wX+KnT$)J z?q9DJt!`;Q=x*ENfpv;+EQ&q00~NPjIvxgp4FJ~|G%?l}(jGl*S&Z;M^_8y(9l7!t z)2uDuoW_B5DXAmi^tG^oXAGFnyU+jSzMBmQl}ppEZXDq^%=6|q?724#1*u?!+;?-- zUy^jcRlJ{>;RT6@XfkJ3)a_`dFPwt}4)D&F%wx^)1|*?87i629Eu%9e8aNyWnD8o2 zwI;ly2Jf|)hw<)cu7L#J`$03u<>$EAF2M}F%$_n=_1pn(E?sqoI}x7tReG<=q0 zXt-6wwDj30^GOgTHO5uk`RX!Pm9Vk#4@>73Oiv;F2N2~g54^>ZhWH12oVFr^{!=Rr z{j$T-29Y79f+31k)@WR(Su^o~N<^@Ea2BfZ{3r|KMFpfIbC*QfBt>pwgve803f=2_^rhCz&dh{bU9i0pIhBqDn0Z6r(XpBteW{mncfISd zVNK#55?N+}KZlQL^OwCoGg-P6+q=*b+68RFkuat^gdGI8>RBvnN+n1C6gkoOJWg=B z(x%L3SL4~!$ls?S85RUwa-a!?mVEfMjjwE#wT)_@33%r;T4rW{f^w+h(1oi4f=^~k zO&G!pCTvfrBFg13EvOHBJvoA>nuKq9tcQ!OeK5Q>4XEQVEXQ|w&~CSytWRm=yhX}n zs$^X>UrsFFcWb>feU}_tvW1C6M#XHGm`iBYiad6vn0gEZ5Q>mxKSh>3lom1&jM_Z1 zu~n2SZLjsK9lI8{ME#R&;KyiE@re6K<#FeR6oPtLV~HZ=JfUNJ3SpC@UkT(M=^y@- zWkPm*&2Se1TTIc`Ua@%^RPRRYz~Fyr3d)L(S(6E1_ecd2ODC-(FbP5DIk^)O{a74! z-LdnjKo-t8`~nM<5-cFj>zP%u#cajSX z5Wkl=wvJQuTJx4G#JtWe;7an7Fim}a(Hmwzm!+%l?BGsvc9S3%^VFZDC4JYZ{8cCo z)+;z4(MW^nV%Wpruy(M#crz`E8Dd5~e;NBb_mIVYa_3Nqd#ix%pnLtUuu9Y(B_UaJ zq~!H8$Mjz|*6IdUVlG#P`D!NZnN*|sa zS-;R{9}KtYP2t<14{+dmL&gfgU$i4&93JfPpfDYt#|1KJCsOoh_ud8eW1t~d={zE( z91$(?EJ_NpSrx+MMp+eJzV@v7PQn_J=o*q-+@nAsc{RAH%dPw-so-&rG~w0IWzV|~;+al{km!jx zz7|RNgq|V|GfQQFuF{?{&S~6XyANtJc0>>eRfVZzfyRZC(GXkef!2COmgK}Nyo5iy zW0|t8qC`cV(Zr~0Y~`}KFjp^U#+|E3f5F9T(Z})DuJ3r449R`CbF=1!$gCUi8N=ve zwNdfP!GJ61kH8B;GhV_C3J0?*F8|Cn29o0Yssv=7`_;x`bf7nL1`R(ekcxf_1=s81a+>2H|p-63@Vq}9nEB$f%Lt$_8e%yxCH-E}(2&3Fz>Y|M%LItQog^`ocW2@2S? zfUB{xfM^UjbHt)Q_qm=oWNnEczb287q?1eq3MzzTA`22@&boM@g+kX%-AhyZNt0wo*cTTRpXF_C9(n zn;07!I@nIF#;s2@fw`#)qT>lz8C=j!^NRgy*LQR(%5s%Rnx$TIaN#AAg-m53rB*L$ zyUbbW?HE=ODc=IvdeW$4nA5`jA`jJmrRP>w?{@fvBlPjB@Egww`(KH`ME0|NAZ9FR zx$rT9=3TNX0Bxk|@LOvL_D_>WSbXe4}-JG)na*?=cTir8jk{WS&75gzK(!?;p zkQD1FDKBjYyK!3kYKUfQz6DO3h=g)c9~7r|wgm8dyw1pw#G8IHdMEyGM8QAg0Xhzd zec>PEKmrU5;j<7iH)S?AaWi2uHw9)ZI-*PCjC%ISDk!Lp(~8Mh`;tRf+;;L$RN7!@ zXjH9^X&B6)76j?W7H+N+EnBkS-{*!H`^ey*0hl4G*MhdE@bE`zd+$R)GDM z6MoR<@=ft)aAKfq9LpkzeHIAInhq6roi{#UpxTd&}DE^RXFL4*Mwa1o7-olEzcP3CvNTP;&9)i7FO7B+d0VwFT`S6R`D4G%J&0cRdv$JJ zr-3;q(NuD=?s;?$RizK+UUJbF?p^oBkLe|VOTQ=3$aLskb?ujvUd*2wPX{F}Y0J7@ z4b7d-yz%$QvHn7j_YEInd^accT0qZ>m$}x)bMD&m9gc}_9jP^`ki1WvHwlG>g&*NR zN2cWG$^#eDLW_XNHNEv;7nD_3F5MTdhgj%rHTFLe#)LycLXJdh4^##4-krVQ9?FGj zXso|tjs&XA5+r;aT+&4MCg6o0V1L|POPF+UdAd0^3q1^mKj{|GdzWBVq>6e=w=72` z)xTpee{7F}dgn6M`uDjHb>GFj-yDauiXY|X5u5(x@OhN{z8qB^Ja{lt3dkRYK9mfu zdNu=_am%;(u0vDs)c+Hc?#%3O6!Z7ZJU(3JWok28s0qoRR(Xg6xjE<7GGlJi=pnT` zBKQueXy46Q0vI1Z3JViIzCc4y9X~osAea0#*^g1V6|porIwr^^?4ttb?0`nHF(SH|QuLYa!4br6r>=2uhDV8o47WOFv)J()S5Q;@nh>9U`0hVneJ`K1c&vG32a@p zp|||9ghpJ7;n>@_j^L2n-}TOY^swvzVlY~FG#VNXtt za#c*NKBq}p(=_?P4t~_4jMM>ntLkDzfs?8g#TUog{>~*`Rj3s7xcds>zi&fMPrL*A^ zUQ1r?pVRDMS({5ttc)+`y{<%FX$rON#*ontE)*CX;rE&B>n3@RLd{_lwY4T1Gv{lUOWgK0k5{}+$-KS%%n zwC#V7_W4A0{SV!%A`kr!3kC-Hc~^dx9-Lrc++d2{YTpz{v>aXROf4MDtVt9d%`NOn kOkB*YZ9FVUSeaPh0sm?kUCpd504A<{BrdM5ZjSK(2cwGW*#H0l literal 0 HcmV?d00001 diff --git a/crates/mysql/database/0.1.0.sql b/crates/mysql/version/0.1.0.sql similarity index 100% rename from crates/mysql/database/0.1.0.sql rename to crates/mysql/version/0.1.0.sql