diff --git a/.gitignore b/.gitignore index daab55e..869df07 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ -/public /target Cargo.lock \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ea88211..e510071 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,24 +1,8 @@ -[package] -name = "rssto" -version = "0.2.2" -edition = "2024" -license = "MIT" -readme = "README.md" -description = "Convert RSS feeds into multiple formats" -keywords = ["rss", "aggregator", "conversion", "html", "gemtext"] -categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] -repository = "https://github.com/YGGverse/rssto" - -[dependencies] -anyhow = "1.0" -chrono = "^0.4.20" -clap = { version = "4.5", features = ["derive"] } -log = "0.4" -regex = "1.12" -reqwest = { version = "0.12", features = ["blocking"] } -rss = "2.0" -serde = { version = "1.0", features = ["derive"] } -strip-tags = "0.1" -toml = "0.9" -tracing-subscriber = { version = "0.3", features = ["env-filter"] } -url = "2.5" +[workspace] +resolver = "2" +members = [ + "crates/crawler", + "crates/http", + "crates/llm", + "crates/mysql", +] \ No newline at end of file diff --git a/README.md b/README.md index 10937a0..ef2f5d3 100644 --- a/README.md +++ b/README.md @@ -4,70 +4,14 @@ [![Dependencies](https://deps.rs/repo/github/YGGverse/rssto/status.svg)](https://deps.rs/repo/github/YGGverse/rssto) [![crates.io](https://img.shields.io/crates/v/rssto.svg)](https://crates.io/crates/rssto) -Convert RSS feeds into multiple formats +Crawl content from RSS feeds into multiple formats -## Features +> [!NOTE] +> Branch in development! -* [x] Multiple feed sources with flexible TOML config options - * [x] Limit channel items - * [x] Format time - * [x] Multiple export format definition -* [x] Custom templates -* [x] Single export or daemon mode with update time -* [x] Export formats: - * [x] HTML - * [x] [Gemtext](https://geminiprotocol.net/docs/gemtext.gmi) +## Components -## Install - -``` bash -cargo install rssto -``` - -## Launch - -``` bash -rssto -c config/example.toml -``` -> [!TIP] -> * prepend `RUST_LOG=DEBUG` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html)) -> * append `-u TIME` to run as the daemon with `TIME` interval update -> * see `rssto --help` to print all available options - -### Systemd - -1. Install `rssto` by copy the binary compiled into the native system apps destination: - * Linux: `sudo install /home/user/.cargo/bin/rssto /usr/local/bin/rssto` -2. Create `systemd` configuration file at `/etc/systemd/system/rssto.service`: - -``` rssto.service -[Unit] -After=network-online.target -Wants=network-online.target - -[Service] -Type=simple - -User=rssto -Group=rssto - -# Uncomment for debug -# Environment="RUST_LOG=DEBUG" -# Environment="NO_COLOR=1" - -ExecStart=/usr/local/bin/rssto -c /path/to/config.toml - -StandardOutput=file:///home/rssto/debug.log -StandardError=file:///home/rssto/error.log - -[Install] -WantedBy=multi-user.target -``` -* example above requires new system user (`useradd -m rssto`) - -3. Run in priority: - - * `systemctl daemon-reload` - reload systemd configuration - * `systemctl enable rssto` - enable new service - * `systemctl start rssto` - start the process - * `systemctl status rssto` - check process launched +* `rssto-crawler` - RSS feed reader and data scrapper daemon +* `rssto-http` - Web server implementation based on the Rocket engine +* `rssto-llm` - Feeds auto-translation +* `rssto-mysql` - Shared database library \ No newline at end of file diff --git a/config/example.toml b/config/example.toml deleted file mode 100644 index 6793e42..0000000 --- a/config/example.toml +++ /dev/null @@ -1,19 +0,0 @@ -update = 60 - -[[feed]] -url = "https://assets.censor.net/rss/censor.net/rss_uk_news.xml" -storage = "./public/censor.net/rss_uk_news" -templates = ["./template/html","./template/gmi"] -list_items_limit = 20 -pub_date_format = "%Y/%m/%d %H:%M:%S %z" -last_build_date_format = "%Y/%m/%d %H:%M:%S %z" -time_generated_format = "%Y/%m/%d %H:%M:%S %z" - -[[feed]] -url = "https://assets.censor.net/rss/censor.net/rss_uk_resonance.xml" -storage = "./public/censor.net/rss_uk_resonance" -templates = ["./template/html","./template/gmi"] -list_items_limit = 20 -pub_date_format = "%Y/%m/%d %H:%M:%S %z" -last_build_date_format = "%Y/%m/%d %H:%M:%S %z" -time_generated_format = "%Y/%m/%d %H:%M:%S %z" \ No newline at end of file diff --git a/crates/crawler/Cargo.toml b/crates/crawler/Cargo.toml new file mode 100644 index 0000000..f6275eb --- /dev/null +++ b/crates/crawler/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "rssto-crawler" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "Crawl RSS feeds into MySQL database" +keywords = ["rss", "aggregator", "conversion", "mysql", "crawler"] +categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] +repository = "https://github.com/YGGverse/rssto" + +[dependencies] +ammonia = "4.1.2" +anyhow = "1.0.100" +chrono = "0.4.42" +clap = { version = "4.5.54", features = ["derive"] } +log = "0.4.29" +mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transaction"], path = "../mysql" } +reqwest = { version = "0.13.1", features = ["blocking"] } +rss = "2.0.12" +scraper = { version = "0.25.0", features = ["serde"] } +serde = { version = "1.0.228", features = ["derive"] } +sha2 = "0.10.9" +toml = "0.9.10" +tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } +url = { version = "2.5.8", features = ["serde"] } \ No newline at end of file diff --git a/crates/crawler/LICENSE b/crates/crawler/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/crawler/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/crawler/README.md b/crates/crawler/README.md new file mode 100644 index 0000000..99fa260 --- /dev/null +++ b/crates/crawler/README.md @@ -0,0 +1,58 @@ +# rssto-crawler + +## Install + +``` bash +git clone https://github.com/YGGverse/rssto.git +cd rssto +cargo build --release +``` + +## Launch + +``` bash +rssto-crawler -c config/example.toml +``` +> [!TIP] +> * prepend `RUST_LOG=rssto_crawler=trace` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html)) +> * or just `RUST_LOG=trace` to debug all components in use +> * append `-u TIME` to run as the daemon with `TIME` interval update +> * see `rssto-crawler --help` to print all available options + +### Systemd + +1. Install `rssto-crawler` by copy the binary compiled into the native system apps destination: + * Linux: `sudo install target/release/rssto-crawler /usr/local/bin/rssto-crawler` +2. Create `systemd` configuration file at `/etc/systemd/system/rssto-crawler.service`: + +``` rssto-crawler.service +[Unit] +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple + +User=rssto +Group=rssto + +# Uncomment for debug +# Environment="RUST_LOG=rssto_crawler=debug" +# Environment="NO_COLOR=1" + +ExecStart=/usr/local/bin/rssto-crawler -c /path/to/config.toml + +StandardOutput=file:///home/rssto/crawler-debug.log +StandardError=file:///home/rssto/crawler-error.log + +[Install] +WantedBy=multi-user.target +``` +* example above requires new system user (`useradd -m rssto`) + +3. Run in priority: + +* `systemctl daemon-reload` - reload systemd configuration +* `systemctl enable rssto-crawler` - enable new service +* `systemctl start rssto-crawler` - start the process +* `systemctl status rssto-crawler` - check process launched \ No newline at end of file diff --git a/crates/crawler/config.toml b/crates/crawler/config.toml new file mode 100644 index 0000000..7c2cb2b --- /dev/null +++ b/crates/crawler/config.toml @@ -0,0 +1,80 @@ +# Rescan feed channels time, in seconds +update = 900 + +# Database connection setup +# * see crates/mysql/database +[mysql] + + host = "localhost" + port = 3306 + username = "" + password = "" + database = "rssto" + +# Content sources (unlimited) +[[channel]] + + # RSS feed source + url = "https://1" + + # Limit latest channel items to crawl (unlimited by default) + items_limit = 5 + + # Save Channel `title` and `description` in the database (currently not in use) + persist_description = true + + # Save Channel item `title` and `description` in the database + persist_item_description = true + + # Allowed tags + # * empty to strip all tags (default) + allowed_tags = ["a", "br", "p", "img"] + + # Grab Channel item content (from the item `link`) + scrape_item_content = false + + # Scrape title by CSS selector + # * None to use Channel item title if exists or fail to continue + # scrape_item_content_title_selector = "h1" + + # Scrape description by CSS selector + # * None to use Channel item description if exists or fail to continue + # scrape_item_content_description_selector = "article" + + # Preload content images locally if `Some` + # * currently stored in the database + # persist_images_selector = "img" + + +[[channel]] + + # RSS feed source + url = "https://2" + + # Limit latest channel items to crawl (unlimited by default) + items_limit = 5 + + # Save Channel `title` and `description` in the database (currently not in use) + persist_description = true + + # Save Channel item `title` and `description` in the database + persist_item_description = true + + # Allowed tags + # * empty to strip all tags (default) + allowed_tags = ["a", "br", "p", "img"] + + # Grab Channel item content (from the item `link`) + scrape_item_content = false + + # Scrape title by CSS selector + # * None to use Channel item title if exists or fail to continue + # scrape_item_content_title_selector = "h1" + + # Scrape description by CSS selector + # * None to use Channel item description if exists or fail to continue + # scrape_item_content_description_selector = "article" + + # Preload content images locally if `Some` + # * currently stored in the database + # persist_images_selector = "img" diff --git a/src/argument.rs b/crates/crawler/src/argument.rs similarity index 86% rename from src/argument.rs rename to crates/crawler/src/argument.rs index 5443edd..3894dd5 100644 --- a/src/argument.rs +++ b/crates/crawler/src/argument.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; pub struct Argument { /// Path to config file /// - /// * see `config/example.toml` + /// * see `config.toml` #[arg(short, long)] pub config: PathBuf, } diff --git a/crates/crawler/src/config.rs b/crates/crawler/src/config.rs new file mode 100644 index 0000000..63fe5a5 --- /dev/null +++ b/crates/crawler/src/config.rs @@ -0,0 +1,47 @@ +use scraper::Selector; +use serde::Deserialize; +use url::Url; + +#[derive(Debug, Deserialize)] +pub struct Mysql { + pub database: String, + pub host: String, + pub password: String, + pub port: u16, + pub username: String, +} + +#[derive(Debug, Deserialize)] +pub struct Channel { + /// RSS feed source + pub url: Url, + /// Limit latest channel items to crawl (unlimited by default) + pub items_limit: Option, + /// Save Channel title and description in the database + pub persist_description: bool, + /// Save Channel item title and description in the database + pub persist_item_description: bool, + /// Grab Channel item content (from the item `link`) + pub scrape_item_content: bool, + /// Scrape title by CSS selector + /// * None to use Channel item title if exists or fail to continue + pub scrape_item_content_title_selector: Option, + /// Scrape description by CSS selector + /// * None to use Channel item description if exists or fail to continue + pub scrape_item_content_description_selector: Option, + /// Allowed tags + /// * empty to strip all tags (default) + pub allowed_tags: std::collections::HashSet, + /// Preload content images locally if `Some` + /// * currently stored in the database + pub persist_images_selector: Option, +} + +#[derive(Debug, Deserialize)] +pub struct Config { + pub mysql: Mysql, + pub channel: Vec, + /// Channels update timeout in seconds + /// * None to generate once + pub update: Option, +} diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs new file mode 100644 index 0000000..cb6b8d2 --- /dev/null +++ b/crates/crawler/src/main.rs @@ -0,0 +1,219 @@ +mod argument; +mod config; + +use anyhow::{Result, bail}; +use log::{debug, info, warn}; +use reqwest::blocking::get; +use url::Url; + +fn main() -> Result<()> { + use chrono::Local; + use clap::Parser; + use std::{env::var, fs::read_to_string}; + + if var("RUST_LOG").is_ok() { + use tracing_subscriber::{EnvFilter, fmt::*}; + struct T; + impl time::FormatTime for T { + fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result { + write!(w, "{}", Local::now()) + } + } + fmt() + .with_timer(T) + .with_env_filter(EnvFilter::from_default_env()) + .init() + } + + let argument = argument::Argument::parse(); + let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?; + let db = mysql::Database::pool( + &config.mysql.host, + config.mysql.port, + &config.mysql.username, + &config.mysql.password, + &config.mysql.database, + )?; + + info!("Crawler started"); + loop { + debug!("Begin new crawl queue..."); + for c in &config.channel { + debug!("Update `{}`...", c.url); + let mut tx = db.transaction()?; + match crawl(&mut tx, c) { + Ok(()) => tx.commit()?, + Err(e) => { + warn!("Channel `{}` update failed: `{e}`", c.url); + tx.rollback()? + } + } + } + debug!("Crawl queue completed"); + if let Some(update) = config.update { + debug!("Wait {update} seconds to continue...",); + std::thread::sleep(std::time::Duration::from_secs(update)) + } else { + return Ok(()); + } + } +} + +fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> { + use std::collections::HashSet; + + /// Removes all tags from `html` excluding `allowed_tags` or all if None + fn strip_tags(html: &str, allowed_tags: Option<&HashSet>) -> String { + ammonia::Builder::new() + .tags(allowed_tags.map_or(HashSet::new(), |a| a.iter().map(|t| t.as_str()).collect())) + .clean(html) + .to_string() + } + + let channel_url = channel_config.url.to_string(); // allocate once + + let channel_id = match tx.channel_id_by_url(&channel_url)? { + Some(channel_id) => channel_id, + None => { + let channel_id = tx.insert_channel(&channel_url)?; + info!("Register new channel #{channel_id} ({channel_url})"); + channel_id + } + }; + + let channel_items = + match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) { + Ok(channel) => { + if channel_config.persist_description { + let channel_description_id = tx.insert_channel_description( + channel_id, + None, + Some(strip_tags(channel.title(), None)), + Some(strip_tags( + channel.description(), + Some(&channel_config.allowed_tags), + )), + )?; + debug!("Save channel description #{channel_description_id}") + } + channel.into_items() + } + Err(e) => bail!("Could not parse response: `{e}`"), + }; + + let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len()); + + for channel_item in channel_items.iter().take(channel_items_limit) { + let guid = match channel_item.guid { + Some(ref guid) => guid.value.as_ref(), + None => bail!("Undefined `guid` field"), + }; + let (link, base) = match channel_item.link { + Some(ref link) => (link, Url::parse(link)?), + None => bail!("Undefined `link` field"), + }; + let pub_date = match channel_item.pub_date { + Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) { + Ok(t) => t.timestamp(), + Err(e) => bail!("Invalid `pub_date` field: `{e}`"), + }, + None => bail!("Undefined `pub_date`"), + }; + if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 { + debug!("Channel item `{guid}` already exists, skipped."); + continue; // skip next steps as processed + } + let channel_item_id = tx.insert_channel_item(channel_id, pub_date, guid, link)?; + info!("Register new channel item #{channel_item_id} ({link})"); + if channel_config.persist_item_description { + let channel_item_description_id = tx.insert_channel_item_description( + channel_item_id, + None, + channel_item.title().map(|s| strip_tags(s, None)), + channel_item + .description() + .map(|s| strip_tags(s, Some(&channel_config.allowed_tags))), + )?; + debug!("Save channel item description #{channel_item_description_id}") + } + // preload remote content.. + if !channel_config.scrape_item_content { + continue; + } + let channel_item_content_id = tx.insert_channel_item_content(channel_item_id)?; + info!("Add new content record #{channel_item_content_id}"); + + let html = scraper::Html::parse_document(&get(link)?.text()?); + let description = match channel_config.scrape_item_content_description_selector { + Some(ref selector) => match html.select(selector).next() { + Some(description) => Some(strip_tags( + &description.inner_html(), + Some(&channel_config.allowed_tags), + )), + None => bail!("Could not scrape `description` selector from `{link}`"), + }, + None => None, + }; + let channel_item_content_description_id = tx.insert_channel_item_content_description( + channel_item_content_id, + None, + match channel_config.scrape_item_content_title_selector { + Some(ref selector) => match html.select(selector).next() { + Some(title) => Some(strip_tags(&title.inner_html(), None)), + None => bail!("Could not scrape `title` selector from `{link}`"), + }, + None => None, + } + .as_ref() + .map(|s| s.trim()), + description.as_ref().map(|s| s.trim()), + )?; + debug!("Save channel item content description #{channel_item_content_description_id}"); + // persist images if enabled + if let Some(ref selector) = channel_config.persist_images_selector { + use sha2::{Digest, Sha256}; + if description.is_none() { + bail!("Field `description` is required to scrape images from `{link}`") + } + for element in scraper::Html::parse_document(&description.unwrap()).select(selector) { + if let Some(src) = element.value().attr("src") { + let absolute = match Url::parse(src) { + Ok(url) => url, + Err(e) => { + if e == url::ParseError::RelativeUrlWithoutBase { + let absolute = base.join(link)?; + debug!("Convert relative image link `{link}` to `{absolute}`"); + absolute + } else { + bail!("Could not parse URL from img source: `{e}`") + } + } + }; + let url = absolute.as_str(); + let data = get(url)?.bytes()?; + let hash = format!("{:x}", Sha256::digest(&data)); + + let image_id = match tx.image_id_by_sha256(&hash)? { + Some(image_id) => image_id, + None => { + let image_id = tx.insert_image(&hash, Some(src), Some(url), &data)?; + info!("Persist new image #{image_id} (`{absolute}`)"); + image_id + } + }; + let channel_item_content_image_id = + tx.insert_channel_item_content_image(channel_item_content_id, image_id)?; + debug!("Add content image relationship #{channel_item_content_image_id}"); + let uri = format!("/image/{image_id}"); + tx.replace_channel_item_content_description( + channel_item_content_description_id, + src, + &uri, + )?; + debug!("Replace content image in description from `{src}` to `{uri}`") + } + } + } + } + Ok(()) +} diff --git a/crates/http/Cargo.toml b/crates/http/Cargo.toml new file mode 100644 index 0000000..e5b8c8c --- /dev/null +++ b/crates/http/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "rssto-http" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "Web server for the rssto DB, based on Rocket engine" +keywords = ["rss", "aggregator", "http", "server"] +categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] +repository = "https://github.com/YGGverse/rssto" + +[dependencies] +chrono = { version = "0.4.41", features = ["serde"] } +clap = { version = "4.5.54", features = ["derive"] } +mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" } +rocket = "0.5.1" +rocket_dyn_templates = { version = "0.2.0", features = ["tera"] } +serde = { version = "1.0.228", features = ["derive"] } +toml = "0.9.10" \ No newline at end of file diff --git a/crates/http/LICENSE b/crates/http/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/http/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/http/README.md b/crates/http/README.md new file mode 100644 index 0000000..e9b1b42 --- /dev/null +++ b/crates/http/README.md @@ -0,0 +1,11 @@ +# rssto-http + +Web server implementation based on the Rocket engine + +> [!NOTE] +> In development! + +``` +cd rssto/crates/rssto-http +cargo run -- -c /path/to/config.toml +``` \ No newline at end of file diff --git a/crates/http/config.toml b/crates/http/config.toml new file mode 100644 index 0000000..0e81ccb --- /dev/null +++ b/crates/http/config.toml @@ -0,0 +1,29 @@ +title = "rssto" +#description = "" + +format_time = "%d/%m/%Y %H:%M" + +# Provider ID (`provider` table) +# * None for the original content +# provider_id = 1 + +# Default listing limit +list_limit = 20 + +# Bind server on given host +host = "127.0.0.1" + +# Bind server on given port +port = 8000 + +#Configure instance in the debug mode +debug = true + +# Database connection setup +# * see crates/mysql/database +[mysql] +host = "localhost" +port = 3306 +username = "" +password = "" +database = "rssto" \ No newline at end of file diff --git a/crates/http/src/argument.rs b/crates/http/src/argument.rs new file mode 100644 index 0000000..3894dd5 --- /dev/null +++ b/crates/http/src/argument.rs @@ -0,0 +1,12 @@ +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +pub struct Argument { + /// Path to config file + /// + /// * see `config.toml` + #[arg(short, long)] + pub config: PathBuf, +} diff --git a/crates/http/src/config.rs b/crates/http/src/config.rs new file mode 100644 index 0000000..a6068dd --- /dev/null +++ b/crates/http/src/config.rs @@ -0,0 +1,24 @@ +use serde::Deserialize; +use std::net::IpAddr; + +#[derive(Debug, Deserialize)] +pub struct Mysql { + pub database: String, + pub host: String, + pub password: String, + pub port: u16, + pub username: String, +} + +#[derive(Debug, Deserialize)] +pub struct Config { + pub mysql: Mysql, + pub title: String, + pub description: Option, + pub format_time: String, + pub provider_id: Option, + pub list_limit: usize, + pub host: IpAddr, + pub port: u16, + pub debug: bool, +} diff --git a/crates/http/src/feed.rs b/crates/http/src/feed.rs new file mode 100644 index 0000000..73d582e --- /dev/null +++ b/crates/http/src/feed.rs @@ -0,0 +1,58 @@ +/// Export crawl index to the RSS file +pub struct Feed { + buffer: String, +} + +impl Feed { + pub fn new(title: &str, description: Option<&str>, capacity: usize) -> Self { + let t = chrono::Utc::now().to_rfc2822(); + let mut buffer = String::with_capacity(capacity); + + buffer.push_str(""); + + buffer.push_str(&format!("{t}")); + buffer.push_str(&format!("{t}")); + buffer.push_str(&format!("{}", escape(title))); + + if let Some(d) = description { + buffer.push_str(&format!("{}", escape(d))); + } + + Self { buffer } + } + + /// Append `item` to the feed `channel` + pub fn push( + &mut self, + guid: u64, + time: chrono::DateTime, + url: String, + title: String, + description: String, + ) { + self.buffer.push_str(&format!( + "{guid}{}{url}{}{}", + escape(&title), + escape(&description), + time.to_rfc2822() + )) + } + + /// Write final bytes + pub fn commit(mut self) -> String { + self.buffer.push_str(""); + self.buffer + } +} + +// @TODO use tera filters? +// https://keats.github.io/tera/docs/#built-in-filters + +fn escape(value: &str) -> String { + value + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) + .replace("'", "'") +} diff --git a/crates/http/src/global.rs b/crates/http/src/global.rs new file mode 100644 index 0000000..8e25ad0 --- /dev/null +++ b/crates/http/src/global.rs @@ -0,0 +1,9 @@ +use rocket::serde::Serialize; + +#[derive(Clone, Debug, Serialize)] +#[serde(crate = "rocket::serde")] +pub struct Global { + pub format_time: String, + pub list_limit: usize, + pub provider_id: Option, +} diff --git a/crates/http/src/main.rs b/crates/http/src/main.rs new file mode 100644 index 0000000..70f8a50 --- /dev/null +++ b/crates/http/src/main.rs @@ -0,0 +1,309 @@ +#[macro_use] +extern crate rocket; + +mod argument; +mod config; +mod feed; +mod global; +mod meta; + +use chrono::{DateTime, Utc}; +use feed::Feed; +use global::Global; +use meta::Meta; +use mysql::{Database, table::Sort}; +use rocket::{ + State, + http::{ContentType, Status}, + response::content::RawXml, + serde::Serialize, +}; +use rocket_dyn_templates::{Template, context}; + +#[get("/?&")] +fn index( + search: Option<&str>, + page: Option, + db: &State, + meta: &State, + global: &State, +) -> Result { + #[derive(Serialize)] + #[serde(crate = "rocket::serde")] + struct Row { + channel_item_content_description_id: u64, + link: String, + time: String, + title: String, + } + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + let total = conn + .channel_item_content_descriptions_total_by_provider_id(global.provider_id, search) + .map_err(|e| { + error!("Could not get contents total: `{e}`"); + Status::InternalServerError + })?; + Ok(Template::render( + "index", + context! { + title: { + let mut t = String::with_capacity(9); + if let Some(q) = search && !q.is_empty() { + t.push_str(q); + t.push_str(S); + t.push_str("Search"); + t.push_str(S) + } + if let Some(p) = page && p > 1 { + t.push_str(&format!("Page {p}")); + t.push_str(S) + } + t.push_str(&meta.title); + if let Some(ref description) = meta.description + && page.is_none_or(|p| p == 1) && search.is_none_or(|q| q.is_empty()) { + t.push_str(S); + t.push_str(description) + } + t + }, + meta: meta.inner(), + back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))), + next: if page.unwrap_or(1) * global.list_limit >= total { None } + else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) }, + rows: conn.channel_item_content_descriptions_by_provider_id( + global.provider_id, + search, + Sort::Desc, + page.map(|p| if p > 1 { p - 1 } else { 1 } * global.list_limit), + Some(global.list_limit) + ).map_err(|e| { + error!("Could not get contents: `{e}`"); + Status::InternalServerError + })? + .into_iter() + .map(|channel_item_content_description| { + let channel_item = conn.channel_item( + channel_item_content_description.channel_item_content_id + ).unwrap().unwrap(); + Row { + channel_item_content_description_id: + channel_item_content_description.channel_item_content_description_id, + link: channel_item.link, + time: time(channel_item.pub_date).format(&global.format_time).to_string(), + title: channel_item_content_description.title.unwrap_or_default(), // @TODO handle + } + }) + .collect::>(), + page: page.unwrap_or(1), + pages: (total as f64 / global.list_limit as f64).ceil(), + total, + search + }, + )) +} + +#[get("/")] +fn info( + channel_item_content_description_id: u64, + db: &State, + meta: &State, + global: &State, +) -> Result { + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + match conn.channel_item_content_description(channel_item_content_description_id).map_err(|e| { + error!("Could not get `channel_item_content_description_id` {channel_item_content_description_id}: `{e}`"); + Status::InternalServerError + })? { + Some(channel_item_content_description) => { + let channel_item_content = conn + .channel_item_content(channel_item_content_description.channel_item_content_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item_content` #{}: `{e}`", + channel_item_content_description.channel_item_content_id + ); + Status::InternalServerError + })? + .ok_or_else(|| { + error!( + "Could not find requested `channel_item_content` #{}", + channel_item_content_description.channel_item_content_id + ); + Status::NotFound + })?; + let channel_item = conn + .channel_item(channel_item_content.channel_item_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item` #{}: `{e}`", + channel_item_content.channel_item_id + ); + Status::InternalServerError + })? + .ok_or_else(|| { + error!( + "Could not find requested `channel_item` #{}", + channel_item_content.channel_item_id + ); + Status::NotFound + })?; + let title = channel_item_content_description.title.unwrap_or_default(); // @TODO handle + Ok(Template::render( + "info", + context! { + description: channel_item_content_description.description, + link: channel_item.link, + meta: meta.inner(), + title: format!("{title}{S}{}", meta.title), + name: title, + time: time(channel_item.pub_date).format(&global.format_time).to_string(), + }, + )) + } + None => Err(Status::NotFound), + } +} + +#[get("/image/")] +fn image(image_id: u64, db: &State) -> Result<(ContentType, Vec), Status> { + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + match conn.image(image_id).map_err(|e| { + error!("Could not get content image `{image_id}`: `{e}`"); + Status::InternalServerError + })? { + Some(image) => Ok((ContentType::Bytes, image.data)), + None => Err(Status::NotFound), + } +} + +#[get("/rss?")] +fn rss( + search: Option<&str>, + global: &State, + meta: &State, + db: &State, +) -> Result, Status> { + let mut feed = Feed::new( + &meta.title, + meta.description.as_deref(), + 1024, // @TODO + ); + let mut conn = db.connection().map_err(|e| { + error!("Could not connect database: `{e}`"); + Status::InternalServerError + })?; + for channel_item_content_description in conn + .channel_item_content_descriptions_by_provider_id( + global.provider_id, + search, + Sort::Desc, + None, + Some(global.list_limit), + ) + .map_err(|e| { + error!( + "Could not load `channel_item_content_description` for `provider` #{:?}: `{e}`", + global.provider_id + ); + Status::InternalServerError + })? + { + let channel_item_content = conn + .channel_item_content(channel_item_content_description.channel_item_content_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item_content` #{}: `{e}`", + channel_item_content_description.channel_item_content_id + ); + Status::InternalServerError + })? + .ok_or_else(|| { + error!( + "Could not find requested `channel_item_content` #{}", + channel_item_content_description.channel_item_content_id + ); + Status::NotFound + })?; + let channel_item = conn + .channel_item(channel_item_content.channel_item_id) + .map_err(|e| { + error!( + "Could not get requested `channel_item` #{}: `{e}`", + channel_item_content.channel_item_id + ); + Status::InternalServerError + })? + .ok_or_else(|| { + error!( + "Could not find requested `channel_item` #{}", + channel_item_content.channel_item_id + ); + Status::NotFound + })?; + feed.push( + channel_item_content_description.channel_item_content_description_id, + time(channel_item.pub_date), + channel_item.link, + channel_item_content_description.title.unwrap_or_default(), // @TODO handle + channel_item_content_description + .description + .unwrap_or_default(), // @TODO handle + ) + } + Ok(RawXml(feed.commit())) +} + +#[launch] +fn rocket() -> _ { + use clap::Parser; + let argument = argument::Argument::parse(); + let config: config::Config = + toml::from_str(&std::fs::read_to_string(argument.config).unwrap()).unwrap(); + rocket::build() + .attach(Template::fairing()) + .configure(rocket::Config { + port: config.port, + address: config.host, + ..if config.debug { + rocket::Config::debug_default() + } else { + rocket::Config::release_default() + } + }) + .manage( + Database::pool( + &config.mysql.host, + config.mysql.port, + &config.mysql.username, + &config.mysql.password, + &config.mysql.database, + ) + .unwrap(), + ) + .manage(Global { + format_time: config.format_time, + list_limit: config.list_limit, + provider_id: config.provider_id, + }) + .manage(Meta { + description: config.description, + title: config.title, + version: env!("CARGO_PKG_VERSION").into(), + }) + .mount("/", routes![index, rss, info, image]) +} + +const S: &str = " • "; + +fn time(timestamp: i64) -> DateTime { + DateTime::::from_timestamp(timestamp, 0).unwrap() +} diff --git a/crates/http/src/meta.rs b/crates/http/src/meta.rs new file mode 100644 index 0000000..c8512d0 --- /dev/null +++ b/crates/http/src/meta.rs @@ -0,0 +1,9 @@ +use rocket::serde::Serialize; + +#[derive(Clone, Debug, Serialize)] +#[serde(crate = "rocket::serde")] +pub struct Meta { + pub description: Option, + pub title: String, + pub version: String, +} diff --git a/crates/http/templates/index.html.tera b/crates/http/templates/index.html.tera new file mode 100644 index 0000000..4cef190 --- /dev/null +++ b/crates/http/templates/index.html.tera @@ -0,0 +1,21 @@ +{% extends "layout" %} +{% block content %} + {% if rows %} + {% for row in rows %} +
+ +

{{ row.title }}

+

{{ row.time }}

+
+ {% endfor %} + {% else %} +
+

Nothing.

+
+ {% endif %} + {% if next %}Next{% endif %} + {% if back %}Back{% endif %} + {% if total %} +

Page {{ page }} / {{ pages }} ({{ total }} total)

+ {% endif %} +{% endblock content %} \ No newline at end of file diff --git a/crates/http/templates/info.html.tera b/crates/http/templates/info.html.tera new file mode 100644 index 0000000..fd74623 --- /dev/null +++ b/crates/http/templates/info.html.tera @@ -0,0 +1,10 @@ +{% extends "layout" %} +{% block content %} +
+

{{ name }}

+

{{ time }}

+
+ {{ description | safe }} +
+
+{% endblock content %} \ No newline at end of file diff --git a/crates/http/templates/layout.html.tera b/crates/http/templates/layout.html.tera new file mode 100644 index 0000000..d10ca68 --- /dev/null +++ b/crates/http/templates/layout.html.tera @@ -0,0 +1,25 @@ + + + + + {{ title }} + {% if meta.description %} + + {% endif %} + + + +
+

{{ meta.title }}

+
+ + +
+
+
+ {% block content %}{% endblock content %} +
+ + \ No newline at end of file diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml new file mode 100644 index 0000000..6f62481 --- /dev/null +++ b/crates/llm/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "rssto-llm" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "LLM daemon for the rssto DB translations" +keywords = ["rss", "llm", "translation", "localization", "server"] +categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"] +repository = "https://github.com/YGGverse/rssto" + +[dependencies] +anyhow = "1.0.100" +chrono = "0.4.42" +clap = { version = "4.5.54", features = ["derive"] } +lancor = "0.1.1" +log = "0.4.29" +mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transaction"], path = "../mysql" } +serde = { version = "1.0.228", features = ["derive"] } +tokio = { version = "1.0", features = ["full"] } +toml = "0.9.10" +tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } diff --git a/crates/llm/LICENSE b/crates/llm/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/llm/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/llm/README.md b/crates/llm/README.md new file mode 100644 index 0000000..16cb13d --- /dev/null +++ b/crates/llm/README.md @@ -0,0 +1,22 @@ +# rssto-llm + +LLM daemon for the rssto DB translations + +> [!NOTE] +> In development! + +1. Setup `rssto-crawler` first and collect initial data + +2. Run LLM server: + +``` +llama-server -hf ggml-org/gemma-3-1b-it-GGUF +``` + +3. Launch `rssto-llm` to handle `content` DB: + +``` +cd rssto/crates/rssto-llm +cargo run -- -c /path/to/config.toml +``` +* see `--help` to display all supported options \ No newline at end of file diff --git a/crates/llm/config.toml b/crates/llm/config.toml new file mode 100644 index 0000000..eb9f926 --- /dev/null +++ b/crates/llm/config.toml @@ -0,0 +1,22 @@ +# Rescan database for new subjects, in seconds +# * process once if not defined +# update = 900 + +# Database connection setup +# * see crates/mysql/database +[mysql] +host = "localhost" +port = 3306 +username = "" +password = "" +database = "rssto" + +# LLM connection setup +[llm] +scheme = "http" +host = "127.0.0.1" +port = 8080 +# Model name +model = "ggml-org/gemma-3-1b-it-GGUF" +# Initial message for the `content` subject (e.g. `translate to...`) +message = "translate to english:" \ No newline at end of file diff --git a/crates/llm/src/argument.rs b/crates/llm/src/argument.rs new file mode 100644 index 0000000..3894dd5 --- /dev/null +++ b/crates/llm/src/argument.rs @@ -0,0 +1,12 @@ +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +pub struct Argument { + /// Path to config file + /// + /// * see `config.toml` + #[arg(short, long)] + pub config: PathBuf, +} diff --git a/crates/llm/src/config.rs b/crates/llm/src/config.rs new file mode 100644 index 0000000..9655ea6 --- /dev/null +++ b/crates/llm/src/config.rs @@ -0,0 +1,27 @@ +use serde::Deserialize; +use std::net::IpAddr; + +#[derive(Debug, Deserialize)] +pub struct Mysql { + pub database: String, + pub host: IpAddr, + pub password: String, + pub port: u16, + pub username: String, +} + +#[derive(Debug, Deserialize)] +pub struct Llm { + pub scheme: String, + pub host: IpAddr, + pub port: u16, + pub model: String, + pub message: String, +} + +#[derive(Debug, Deserialize)] +pub struct Config { + pub mysql: Mysql, + pub llm: Llm, + pub update: Option, +} diff --git a/crates/llm/src/main.rs b/crates/llm/src/main.rs new file mode 100644 index 0000000..7184303 --- /dev/null +++ b/crates/llm/src/main.rs @@ -0,0 +1,124 @@ +mod argument; +mod config; + +use anyhow::Result; +use mysql::Database; + +#[tokio::main] +async fn main() -> Result<()> { + use chrono::Local; + use clap::Parser; + use lancor::{ChatCompletionRequest, LlamaCppClient, Message}; + use log::{debug, info}; + + use std::env::var; + + if var("RUST_LOG").is_ok() { + use tracing_subscriber::{EnvFilter, fmt::*}; + struct T; + impl time::FormatTime for T { + fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result { + write!(w, "{}", Local::now()) + } + } + fmt() + .with_timer(T) + .with_env_filter(EnvFilter::from_default_env()) + .init() + } + + let argument = argument::Argument::parse(); + let config: config::Config = toml::from_str(&std::fs::read_to_string(argument.config)?)?; + + let llm = LlamaCppClient::new(format!( + "{}://{}:{}", + config.llm.scheme, config.llm.host, config.llm.port + ))?; + let db = Database::pool( + &config.mysql.host.to_string(), + config.mysql.port, + &config.mysql.username, + &config.mysql.password, + &config.mysql.database, + )?; + + let provider_id = { + let mut conn = db.connection()?; + match conn.provider_id_by_name(&config.llm.model)? { + Some(provider_id) => { + debug!( + "Use existing DB provider #{} matches model name `{}`", + provider_id, &config.llm.model + ); + provider_id + } + None => { + let provider_id = conn.insert_provider(&config.llm.model)?; + info!( + "Provider `{}` not found in database, created new one with ID `{provider_id}`", + &config.llm.model + ); + provider_id + } + } + }; + + info!("Daemon started"); + loop { + debug!("New queue begin..."); + let mut tx = db.transaction()?; + for channel_item_content_description in + tx.channel_item_content_descriptions_queue_for_provider_id(provider_id)? + { + debug!( + "Begin generating `channel_item_content_description` #{} using `provider_id` #{provider_id}.", + channel_item_content_description.channel_item_content_description_id + ); + let title = match channel_item_content_description.title { + Some(subject) => Some( + llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, subject)), + )) + .await? + .choices[0] + .message + .content + .trim() + .to_string(), + ), + None => None, + }; + let description = match channel_item_content_description.description { + Some(subject) => Some( + llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message( + Message::user(format!("{}\n{}", config.llm.message, subject)), + )) + .await? + .choices[0] + .message + .content + .trim() + .to_string(), + ), + None => None, + }; + let channel_item_content_description_id = tx.insert_channel_item_content_description( + channel_item_content_description.channel_item_content_id, + Some(provider_id), + title.as_deref(), + description.as_deref(), + )?; + info!( + "Create `channel_item_content_description` #{channel_item_content_description_id} by `provider_id` #{provider_id}." + ); + } + tx.commit()?; + debug!("Queue completed"); + if let Some(update) = config.update { + debug!("Wait {update} seconds to continue..."); + std::thread::sleep(std::time::Duration::from_secs(update)) + } else { + return Ok(()); + } + } +} diff --git a/crates/mysql/Cargo.toml b/crates/mysql/Cargo.toml new file mode 100644 index 0000000..0a151f3 --- /dev/null +++ b/crates/mysql/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "rssto-mysql" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "Shared MySQL database library" +keywords = ["rssto", "database", "mysql", "library", "api"] +# categories = [] +repository = "https://github.com/YGGverse/rssto" + +[features] +default = [] +transaction = [] + +[dependencies] +mysql = "26.0.1" \ No newline at end of file diff --git a/crates/mysql/LICENSE b/crates/mysql/LICENSE new file mode 100644 index 0000000..a9c0006 --- /dev/null +++ b/crates/mysql/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 YGGverse + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/crates/mysql/README.md b/crates/mysql/README.md new file mode 100644 index 0000000..62e2c8e --- /dev/null +++ b/crates/mysql/README.md @@ -0,0 +1,6 @@ +# rssto-mysql + +Shared MySQL database library + +> [!TIP] +> See `database.mwb` model or `version` directory to deploy \ No newline at end of file diff --git a/crates/mysql/database.mwb b/crates/mysql/database.mwb new file mode 100644 index 0000000..c6dbc34 Binary files /dev/null and b/crates/mysql/database.mwb differ diff --git a/crates/mysql/src/connection.rs b/crates/mysql/src/connection.rs new file mode 100644 index 0000000..35c3469 --- /dev/null +++ b/crates/mysql/src/connection.rs @@ -0,0 +1,143 @@ +use crate::table::*; +use mysql::{Error, Pool, PooledConn, prelude::Queryable}; + +/// Safe, read-only operations used in client apps like `rssto-http` +pub struct Connection { + conn: PooledConn, +} + +impl Connection { + pub fn create(pool: &Pool) -> Result { + Ok(Self { + conn: pool.get_conn()?, + }) + } + + pub fn channel_item(&mut self, channel_item_id: u64) -> Result, Error> { + self.conn.exec_first( + "SELECT `channel_item_id`, + `channel_id`, + `pub_date`, + `guid`, + `link` FROM `channel_item` WHERE `channel_item_id` = ?", + (channel_item_id,), + ) + } + + pub fn channel_item_content( + &mut self, + channel_item_content_id: u64, + ) -> Result, Error> { + self.conn.exec_first( + "SELECT `channel_item_content_id`, + `channel_item_id` + FROM `channel_item_content` WHERE `channel_item_content_id` = ?", + (channel_item_content_id,), + ) + } + + pub fn channel_item_content_description( + &mut self, + channel_item_content_description_id: u64, + ) -> Result, Error> { + self.conn.exec_first( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` FROM `channel_item_content_description` + WHERE `channel_item_content_description_id` = ?", + (channel_item_content_description_id,), + ) + } + + pub fn channel_item_content_descriptions_total_by_provider_id( + &mut self, + provider_id: Option, + keyword: Option<&str>, + ) -> Result { + let total: Option = match keyword { + Some(k) => self.conn.exec_first( + "SELECT COUNT(*) FROM `channel_item_content_description` + WHERE `provider_id` <=> ? AND `title` LIKE '%?%'", + (provider_id, k), + )?, + None => self.conn.exec_first( + "SELECT COUNT(*) FROM `channel_item_content_description` + WHERE `provider_id` <=> ?", + (provider_id,), + )?, + }; + + Ok(total.unwrap_or(0)) + } + + pub fn channel_item_content_descriptions_by_provider_id( + &mut self, + provider_id: Option, + keyword: Option<&str>, + sort: Sort, + start: Option, + limit: Option, + ) -> Result, Error> { + match keyword { + Some(k) => self.conn.exec( + format!( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` + FROM `channel_item_content_description` + WHERE `provider_id` <=> ? AND `title` LIKE '%?%' + ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}", + start.unwrap_or(0), + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id, k), + ), + None => self.conn.exec( + format!( + "SELECT `channel_item_content_description_id`, + `channel_item_content_id`, + `provider_id`, + `title`, + `description` + FROM `channel_item_content_description` + WHERE `provider_id` <=> ? + ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}", + start.unwrap_or(0), + limit.unwrap_or(DEFAULT_LIMIT) + ), + (provider_id,), + ), + } + } + + pub fn image(&mut self, image_id: u64) -> Result, Error> { + self.conn.exec_first( + "SELECT `image_id`, + `provider_id`, + `sha256`, + `src`, + `url`, + `data` FROM `image` WHERE `image_id` = ?", + (image_id,), + ) + } + + pub fn provider_id_by_name(&mut self, name: &str) -> Result, Error> { + self.conn.exec_first( + "SELECT `provider_id` FROM `provider` WHERE `name` = ?", + (name,), + ) + } + + pub fn insert_provider(&mut self, name: &str) -> Result { + self.conn + .exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?; + Ok(self.conn.last_insert_id()) + } +} + +const DEFAULT_LIMIT: usize = 100; diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs new file mode 100644 index 0000000..53ef7d6 --- /dev/null +++ b/crates/mysql/src/lib.rs @@ -0,0 +1,36 @@ +mod connection; +pub mod table; +#[cfg(feature = "transaction")] +mod transaction; + +pub use connection::Connection; +#[cfg(feature = "transaction")] +pub use transaction::Transaction; +pub struct Database { + pool: mysql::Pool, +} + +impl Database { + pub fn pool( + host: &str, + port: u16, + user: &str, + password: &str, + database: &str, + ) -> Result { + Ok(Self { + pool: mysql::Pool::new( + format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(), + )?, + }) + } + + pub fn connection(&self) -> Result { + Connection::create(&self.pool) + } + + #[cfg(feature = "transaction")] + pub fn transaction(&self) -> Result { + Transaction::create(&self.pool) + } +} diff --git a/crates/mysql/src/table.rs b/crates/mysql/src/table.rs new file mode 100644 index 0000000..2c9218d --- /dev/null +++ b/crates/mysql/src/table.rs @@ -0,0 +1,74 @@ +use mysql::prelude::FromRow; + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Channel { + pub channel_id: u64, + pub url: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItem { + pub channel_item_id: u64, + pub channel_id: u64, + pub pub_date: i64, + pub guid: String, + pub link: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItemDescription { + pub channel_item_description_id: u64, + pub channel_item_id: u64, + pub provider_id: Option, + pub title: Option, + pub description: Option, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItemContent { + pub channel_item_content_id: u64, + pub channel_item_id: u64, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct ChannelItemContentDescription { + pub channel_item_content_description_id: u64, + pub channel_item_content_id: u64, + pub provider_id: Option, + pub title: Option, + pub description: Option, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Provider { + pub provider_id: u64, + pub name: String, +} + +#[derive(Debug, PartialEq, Eq, FromRow)] +pub struct Image { + pub image_id: u64, + pub provider_id: Option, + /// Keep image unique by comparing its data hash + pub sha256: String, + /// Original `src` tag value to post-replacing + pub src: Option, + /// Resolved absolute URL + pub url: Option, + /// Image data, MEDIUMBLOB (16M) + pub data: Vec, +} + +pub enum Sort { + Asc, + Desc, +} + +impl std::fmt::Display for Sort { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::Asc => write!(f, "ASC"), + Self::Desc => write!(f, "DESC"), + } + } +} diff --git a/crates/mysql/src/transaction.rs b/crates/mysql/src/transaction.rs new file mode 100644 index 0000000..a087ece --- /dev/null +++ b/crates/mysql/src/transaction.rs @@ -0,0 +1,195 @@ +use crate::table::*; +use mysql::{Error, Pool, TxOpts, prelude::Queryable}; + +/// Safe, optimized read/write operations +/// mostly required by the `rssto-crawler` and `rssto-llm` +/// * all members implementation requires `commit` action +pub struct Transaction { + tx: mysql::Transaction<'static>, +} + +impl Transaction { + pub fn create(pool: &Pool) -> Result { + Ok(Self { + tx: pool.start_transaction(TxOpts::default())?, + }) + } + + pub fn commit(self) -> Result<(), Error> { + self.tx.commit() + } + + pub fn rollback(self) -> Result<(), Error> { + self.tx.rollback() + } + + pub fn channel_id_by_url(&mut self, url: &str) -> Result, Error> { + self.tx.exec_first( + "SELECT `channel_id` FROM `channel` WHERE `url` = ? LIMIT 1", + (url,), + ) + } + + pub fn insert_channel(&mut self, url: &str) -> Result { + self.tx + .exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn insert_channel_description( + &mut self, + channel_id: u64, + provider_id: Option, + title: Option, + description: Option, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_description` SET `channel_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn channel_items_total_by_channel_id_guid( + &mut self, + channel_id: u64, + guid: &str, + ) -> Result { + Ok(self + .tx + .exec_first( + "SELECT COUNT(*) FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ?", + (channel_id, guid), + )? + .unwrap_or(0)) + } + + pub fn insert_channel_item( + &mut self, + channel_id: u64, + pub_date: i64, + guid: &str, + link: &str, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item` SET `channel_id` = ?, + `pub_date` = ?, + `guid` = ?, + `link` = ?", + (channel_id, pub_date, guid, link), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn insert_channel_item_description( + &mut self, + channel_item_id: u64, + provider_id: Option, + title: Option, + description: Option, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_description` SET `channel_item_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_item_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn channel_item_content_descriptions_queue_for_provider_id( + &mut self, + provider_id: u64, + ) -> Result, Error> { + self.tx.exec( + "SELECT `t1`.`channel_item_content_description_id`, + `t1`.`channel_item_content_id`, + `t1`.`provider_id`, + `t1`.`title`, + `t1`.`description` + FROM `channel_item_content_description` AS `t1` + WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS ( + SELECT NULL FROM `channel_item_content_description` AS `t2` + WHERE `t2`.`channel_item_content_description_id` = `t1`.`channel_item_content_description_id` + AND `t2`.`provider_id` = ? LIMIT 1 + )", + (provider_id,), + ) + } // @TODO upgrade to the latest version + + pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_content` SET `channel_item_id` = ?", + (channel_item_id,), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn insert_channel_item_content_description( + &mut self, + channel_item_content_id: u64, + provider_id: Option, + title: Option<&str>, + description: Option<&str>, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_content_description` SET `channel_item_content_id` = ?, + `provider_id` = ?, + `title` = ?, + `description` = ?", + (channel_item_content_id, provider_id, title, description), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn replace_channel_item_content_description( + &mut self, + channel_item_content_description_id: u64, + from: &str, + to: &str, + ) -> Result<(), Error> { + self.tx.exec_drop( + "UPDATE `channel_item_content_description` + SET `description` = REPLACE(`description`, ?, ?) + WHERE `channel_item_content_description_id` = ?", + (from, to, channel_item_content_description_id), + ) + } + + pub fn insert_channel_item_content_image( + &mut self, + channel_item_content_id: u64, + image_id: u64, + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `channel_item_content_image` SET `channel_item_content_id` = ?, `image_id` = ?", + (channel_item_content_id, image_id), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } + + pub fn image_id_by_sha256(&mut self, sha256: &str) -> Result, Error> { + self.tx.exec_first( + "SELECT `image_id` FROM `image` WHERE `sha256` = ? LIMIT 1", + (sha256,), + ) + } + + pub fn insert_image( + &mut self, + sha256: &str, + src: Option<&str>, + url: Option<&str>, + data: &[u8], + ) -> Result { + self.tx.exec_drop( + "INSERT INTO `image` SET `sha256` = ?, `src` = ?, `url` = ?, `data` = ?", + (sha256, src, url, data), + )?; + Ok(self.tx.last_insert_id().unwrap()) + } +} diff --git a/crates/mysql/version/0.1.0.sql b/crates/mysql/version/0.1.0.sql new file mode 100644 index 0000000..443595f --- /dev/null +++ b/crates/mysql/version/0.1.0.sql @@ -0,0 +1,202 @@ +-- MySQL Script generated by MySQL Workbench +-- нд, 11-січ-2026 21:01:10 +0200 +-- Model: New Model Version: 1.0 +-- MySQL Workbench Forward Engineering + +SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0; +SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0; +SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'; + +-- ----------------------------------------------------- +-- Schema rssto +-- ----------------------------------------------------- + +-- ----------------------------------------------------- +-- Schema rssto +-- ----------------------------------------------------- +CREATE SCHEMA IF NOT EXISTS `rssto` ; +USE `rssto` ; + +-- ----------------------------------------------------- +-- Table `rssto`.`channel` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel` ( + `channel_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `url` VARCHAR(255) NOT NULL, + PRIMARY KEY (`channel_id`), + UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` ( + `channel_item_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL, + `pub_date` BIGINT NOT NULL, + `guid` VARCHAR(255) NOT NULL, + `link` VARCHAR(255) NOT NULL, + PRIMARY KEY (`channel_item_id`, `channel_id`), + INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_channel` + FOREIGN KEY (`channel_id`) + REFERENCES `rssto`.`channel` (`channel_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`provider` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`provider` ( + `provider_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `name` VARCHAR(255) NOT NULL, + PRIMARY KEY (`provider_id`), + UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_content` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content` ( + `channel_item_content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_id` INT UNSIGNED NOT NULL, + PRIMARY KEY (`channel_item_content_id`, `channel_item_id`), + INDEX `fk_channel_item_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_channel_item` + FOREIGN KEY (`channel_item_id`) + REFERENCES `rssto`.`channel_item` (`channel_item_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`image` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`image` ( + `image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `provider_id` INT UNSIGNED NULL, + `sha256` CHAR(64) NOT NULL, + `src` VARCHAR(2048) NULL, + `url` VARCHAR(2048) NULL, + `data` MEDIUMBLOB NOT NULL, + PRIMARY KEY (`image_id`), + UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE, + INDEX `fk_image_provider_idx` (`provider_id` ASC) VISIBLE, + CONSTRAINT `fk_image_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_content_image` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` ( + `channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_content_id` BIGINT UNSIGNED NOT NULL, + `image_id` BIGINT UNSIGNED NOT NULL, + PRIMARY KEY (`channel_item_content_image_id`), + INDEX `fk_channel_item_content_image_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE, + INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_image_channel_item_content` + FOREIGN KEY (`channel_item_content_id`) + REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_item_content_image_image` + FOREIGN KEY (`image_id`) + REFERENCES `rssto`.`image` (`image_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_description` ( + `channel_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_id` INT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + PRIMARY KEY (`channel_description_id`), + INDEX `fk_channel_description_provider_idx` (`provider_id` ASC) VISIBLE, + INDEX `fk_channel_description_channel_idx` (`channel_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_description_channel` + FOREIGN KEY (`channel_id`) + REFERENCES `rssto`.`channel` (`channel_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_description` ( + `channel_item_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_id` INT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + INDEX `fk_channel_item_description_channel_item_idx` (`channel_item_id` ASC) VISIBLE, + INDEX `fk_channel_item_description_provider_idx` (`provider_id` ASC) VISIBLE, + PRIMARY KEY (`channel_item_description_id`), + UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_description_channel_item` + FOREIGN KEY (`channel_item_id`) + REFERENCES `rssto`.`channel_item` (`channel_item_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_item_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +-- ----------------------------------------------------- +-- Table `rssto`.`channel_item_content_description` +-- ----------------------------------------------------- +CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_description` ( + `channel_item_content_description_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `channel_item_content_id` BIGINT UNSIGNED NOT NULL, + `provider_id` INT UNSIGNED NULL, + `title` TEXT NULL, + `description` LONGTEXT NULL, + PRIMARY KEY (`channel_item_content_description_id`), + INDEX `fk_channel_item_content_description_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE, + INDEX `fk_channel_item_content_description_provider_idx` (`provider_id` ASC) VISIBLE, + UNIQUE INDEX `UNIQUE` (`channel_item_content_id` ASC, `provider_id` ASC) VISIBLE, + CONSTRAINT `fk_channel_item_content_description_channel_item_content` + FOREIGN KEY (`channel_item_content_id`) + REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION, + CONSTRAINT `fk_channel_item_content_description_provider` + FOREIGN KEY (`provider_id`) + REFERENCES `rssto`.`provider` (`provider_id`) + ON DELETE NO ACTION + ON UPDATE NO ACTION) +ENGINE = InnoDB; + + +SET SQL_MODE=@OLD_SQL_MODE; +SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; +SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS; diff --git a/src/config.rs b/src/config.rs deleted file mode 100644 index 2fb41ec..0000000 --- a/src/config.rs +++ /dev/null @@ -1,32 +0,0 @@ -use serde::Deserialize; -use std::path::PathBuf; -use url::Url; - -#[derive(Debug, Deserialize)] -pub struct Feed { - /// RSS feed source - pub url: Url, - - /// Destination directory - pub storage: PathBuf, - - /// Path to templates (export formats) - pub templates: Vec, - - /// Limit channel items (unlimited by default) - pub list_items_limit: Option, - - pub pub_date_format: String, - pub last_build_date_format: String, - pub time_generated_format: String, -} - -#[derive(Debug, Deserialize)] -pub struct Config { - pub feed: Vec, - - /// Update timeout in seconds - /// - /// * None to generate once - pub update: Option, -} diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 8aae376..0000000 --- a/src/main.rs +++ /dev/null @@ -1,148 +0,0 @@ -mod argument; -mod config; - -use anyhow::Result; -use argument::Argument; -use chrono::{DateTime, Local}; -use clap::Parser; -use config::{Config, Feed}; -use log::{debug, info, warn}; -use std::{ - env::var, - fs::{File, create_dir_all, read_to_string}, - io::Write, - path::PathBuf, -}; -use strip_tags::*; - -fn main() -> Result<()> { - if var("RUST_LOG").is_ok() { - use tracing_subscriber::{EnvFilter, fmt::*}; - struct T; - impl time::FormatTime for T { - fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result { - write!(w, "{}", Local::now()) - } - } - fmt() - .with_timer(T) - .with_env_filter(EnvFilter::from_default_env()) - .init() - } - - let argument = Argument::parse(); - let config: Config = toml::from_str(&read_to_string(argument.config)?)?; - - info!("Crawler started"); - - loop { - debug!("Begin new crawl queue..."); - - for feed in &config.feed { - debug!("Update `{}`...", feed.url); - if let Err(e) = crawl(feed) { - warn!("Feed `{}` update failed: `{e}`", feed.url) - } - } - - debug!("Crawl queue completed"); - - if let Some(update) = config.update { - debug!("Wait {update} seconds to continue...",); - std::thread::sleep(std::time::Duration::from_secs(update)) - } else { - return Ok(()); - } - } -} - -fn crawl(feed: &Feed) -> Result<()> { - use reqwest::blocking::get; - use rss::Channel; - - let channel = Channel::read_from(&get(feed.url.as_str())?.bytes()?[..])?; - let channel_items = channel.items(); - let channel_items_limit = feed.list_items_limit.unwrap_or(channel_items.len()); - let regex = regex::Regex::new(r"\n{3,}").unwrap(); - - for template in &feed.templates { - let root = PathBuf::from(template); - let extension = root.file_name().unwrap().to_string_lossy(); - - let index = { - let mut p = PathBuf::from(&root); - p.push(format!("index.{extension}")); - read_to_string(p)? - }; - - let index_item = { - let mut p = PathBuf::from(&root); - p.push("index"); - p.push(format!("item.{extension}")); - read_to_string(p)? - }; - - create_dir_all(&feed.storage)?; - File::create({ - let mut p = PathBuf::from(&feed.storage); - p.push(format!("index.{extension}")); - p - })? - .write_all( - index - .replace("{title}", &strip_tags(channel.title())) - .replace("{description}", &strip_tags(channel.description())) - .replace("{link}", channel.link()) - .replace("{language}", channel.language().unwrap_or_default()) - .replace( - "{pub_date}", - &time(channel.pub_date(), &feed.pub_date_format), - ) - .replace( - "{last_build_date}", - &time(channel.last_build_date(), &feed.last_build_date_format), - ) - .replace("{time_generated}", &time(None, &feed.time_generated_format)) - .replace( - "{items}", - &channel_items - .iter() - .take(channel_items_limit) - .map(|i| { - regex - .replace_all( - &index_item - .replace( - "{title}", - &strip_tags(i.title().unwrap_or_default()), - ) - .replace( - "{description}", - &strip_tags(i.description().unwrap_or_default()), - ) - .replace("{link}", i.link().unwrap_or_default()) - .replace( - "{pub_date}", - &time(i.pub_date(), &feed.pub_date_format), - ), - "\n\n", - ) - .to_string() - }) - .collect::(), - ) - .as_bytes(), - )? - } - - Ok(()) -} - -fn time(value: Option<&str>, format: &str) -> String { - match value { - Some(v) => DateTime::parse_from_rfc2822(v).unwrap(), - None => Local::now().into(), - } - .format(format) - .to_string() -} diff --git a/template/gmi/index.gmi b/template/gmi/index.gmi deleted file mode 100644 index c6cbf99..0000000 --- a/template/gmi/index.gmi +++ /dev/null @@ -1,7 +0,0 @@ -# {title} - -{description} - -## {time_generated} - -{items} \ No newline at end of file diff --git a/template/gmi/index/item.gmi b/template/gmi/index/item.gmi deleted file mode 100644 index dea579f..0000000 --- a/template/gmi/index/item.gmi +++ /dev/null @@ -1,6 +0,0 @@ - -### {title} - -{description} - -=> {link} {pub_date} diff --git a/template/html/index.html b/template/html/index.html deleted file mode 100644 index 1d16c3e..0000000 --- a/template/html/index.html +++ /dev/null @@ -1,49 +0,0 @@ - - - - - - {title} - - - -
-

{title}

- {description} -
-
- {items} -
-
-

- Source: {title} | - Updated: {pub_date} | - Build: {last_build_date} | - Generated: {time_generated} -

-

- Powered by rssto. -

-
- - \ No newline at end of file diff --git a/template/html/index/item.html b/template/html/index/item.html deleted file mode 100644 index b60abde..0000000 --- a/template/html/index/item.html +++ /dev/null @@ -1,5 +0,0 @@ - \ No newline at end of file