This commit is contained in:
oooo-ps 2026-01-22 13:52:25 +00:00 committed by GitHub
commit ad46b34813
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
46 changed files with 1969 additions and 356 deletions

1
.gitignore vendored
View file

@ -1,3 +1,2 @@
/public
/target
Cargo.lock

View file

@ -1,24 +1,8 @@
[package]
name = "rssto"
version = "0.2.2"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Convert RSS feeds into multiple formats"
keywords = ["rss", "aggregator", "conversion", "html", "gemtext"]
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
repository = "https://github.com/YGGverse/rssto"
[dependencies]
anyhow = "1.0"
chrono = "^0.4.20"
clap = { version = "4.5", features = ["derive"] }
log = "0.4"
regex = "1.12"
reqwest = { version = "0.12", features = ["blocking"] }
rss = "2.0"
serde = { version = "1.0", features = ["derive"] }
strip-tags = "0.1"
toml = "0.9"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = "2.5"
[workspace]
resolver = "2"
members = [
"crates/crawler",
"crates/http",
"crates/llm",
"crates/mysql",
]

View file

@ -4,70 +4,14 @@
[![Dependencies](https://deps.rs/repo/github/YGGverse/rssto/status.svg)](https://deps.rs/repo/github/YGGverse/rssto)
[![crates.io](https://img.shields.io/crates/v/rssto.svg)](https://crates.io/crates/rssto)
Convert RSS feeds into multiple formats
Crawl content from RSS feeds into multiple formats
## Features
> [!NOTE]
> Branch in development!
* [x] Multiple feed sources with flexible TOML config options
* [x] Limit channel items
* [x] Format time
* [x] Multiple export format definition
* [x] Custom templates
* [x] Single export or daemon mode with update time
* [x] Export formats:
* [x] HTML
* [x] [Gemtext](https://geminiprotocol.net/docs/gemtext.gmi)
## Components
## Install
``` bash
cargo install rssto
```
## Launch
``` bash
rssto -c config/example.toml
```
> [!TIP]
> * prepend `RUST_LOG=DEBUG` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html))
> * append `-u TIME` to run as the daemon with `TIME` interval update
> * see `rssto --help` to print all available options
### Systemd
1. Install `rssto` by copy the binary compiled into the native system apps destination:
* Linux: `sudo install /home/user/.cargo/bin/rssto /usr/local/bin/rssto`
2. Create `systemd` configuration file at `/etc/systemd/system/rssto.service`:
``` rssto.service
[Unit]
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=rssto
Group=rssto
# Uncomment for debug
# Environment="RUST_LOG=DEBUG"
# Environment="NO_COLOR=1"
ExecStart=/usr/local/bin/rssto -c /path/to/config.toml
StandardOutput=file:///home/rssto/debug.log
StandardError=file:///home/rssto/error.log
[Install]
WantedBy=multi-user.target
```
* example above requires new system user (`useradd -m rssto`)
3. Run in priority:
* `systemctl daemon-reload` - reload systemd configuration
* `systemctl enable rssto` - enable new service
* `systemctl start rssto` - start the process
* `systemctl status rssto` - check process launched
* `rssto-crawler` - RSS feed reader and data scrapper daemon
* `rssto-http` - Web server implementation based on the Rocket engine
* `rssto-llm` - Feeds auto-translation
* `rssto-mysql` - Shared database library

View file

@ -1,19 +0,0 @@
update = 60
[[feed]]
url = "https://assets.censor.net/rss/censor.net/rss_uk_news.xml"
storage = "./public/censor.net/rss_uk_news"
templates = ["./template/html","./template/gmi"]
list_items_limit = 20
pub_date_format = "%Y/%m/%d %H:%M:%S %z"
last_build_date_format = "%Y/%m/%d %H:%M:%S %z"
time_generated_format = "%Y/%m/%d %H:%M:%S %z"
[[feed]]
url = "https://assets.censor.net/rss/censor.net/rss_uk_resonance.xml"
storage = "./public/censor.net/rss_uk_resonance"
templates = ["./template/html","./template/gmi"]
list_items_limit = 20
pub_date_format = "%Y/%m/%d %H:%M:%S %z"
last_build_date_format = "%Y/%m/%d %H:%M:%S %z"
time_generated_format = "%Y/%m/%d %H:%M:%S %z"

26
crates/crawler/Cargo.toml Normal file
View file

@ -0,0 +1,26 @@
[package]
name = "rssto-crawler"
version = "0.1.0"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Crawl RSS feeds into MySQL database"
keywords = ["rss", "aggregator", "conversion", "mysql", "crawler"]
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
repository = "https://github.com/YGGverse/rssto"
[dependencies]
ammonia = "4.1.2"
anyhow = "1.0.100"
chrono = "0.4.42"
clap = { version = "4.5.54", features = ["derive"] }
log = "0.4.29"
mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transaction"], path = "../mysql" }
reqwest = { version = "0.13.1", features = ["blocking"] }
rss = "2.0.12"
scraper = { version = "0.25.0", features = ["serde"] }
serde = { version = "1.0.228", features = ["derive"] }
sha2 = "0.10.9"
toml = "0.9.10"
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
url = { version = "2.5.8", features = ["serde"] }

21
crates/crawler/LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 YGGverse
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

58
crates/crawler/README.md Normal file
View file

@ -0,0 +1,58 @@
# rssto-crawler
## Install
``` bash
git clone https://github.com/YGGverse/rssto.git
cd rssto
cargo build --release
```
## Launch
``` bash
rssto-crawler -c config/example.toml
```
> [!TIP]
> * prepend `RUST_LOG=rssto_crawler=trace` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html))
> * or just `RUST_LOG=trace` to debug all components in use
> * append `-u TIME` to run as the daemon with `TIME` interval update
> * see `rssto-crawler --help` to print all available options
### Systemd
1. Install `rssto-crawler` by copy the binary compiled into the native system apps destination:
* Linux: `sudo install target/release/rssto-crawler /usr/local/bin/rssto-crawler`
2. Create `systemd` configuration file at `/etc/systemd/system/rssto-crawler.service`:
``` rssto-crawler.service
[Unit]
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=rssto
Group=rssto
# Uncomment for debug
# Environment="RUST_LOG=rssto_crawler=debug"
# Environment="NO_COLOR=1"
ExecStart=/usr/local/bin/rssto-crawler -c /path/to/config.toml
StandardOutput=file:///home/rssto/crawler-debug.log
StandardError=file:///home/rssto/crawler-error.log
[Install]
WantedBy=multi-user.target
```
* example above requires new system user (`useradd -m rssto`)
3. Run in priority:
* `systemctl daemon-reload` - reload systemd configuration
* `systemctl enable rssto-crawler` - enable new service
* `systemctl start rssto-crawler` - start the process
* `systemctl status rssto-crawler` - check process launched

View file

@ -0,0 +1,80 @@
# Rescan feed channels time, in seconds
update = 900
# Database connection setup
# * see crates/mysql/database
[mysql]
host = "localhost"
port = 3306
username = ""
password = ""
database = "rssto"
# Content sources (unlimited)
[[channel]]
# RSS feed source
url = "https://1"
# Limit latest channel items to crawl (unlimited by default)
items_limit = 5
# Save Channel `title` and `description` in the database (currently not in use)
persist_description = true
# Save Channel item `title` and `description` in the database
persist_item_description = true
# Allowed tags
# * empty to strip all tags (default)
allowed_tags = ["a", "br", "p", "img"]
# Grab Channel item content (from the item `link`)
scrape_item_content = false
# Scrape title by CSS selector
# * None to use Channel item title if exists or fail to continue
# scrape_item_content_title_selector = "h1"
# Scrape description by CSS selector
# * None to use Channel item description if exists or fail to continue
# scrape_item_content_description_selector = "article"
# Preload content images locally if `Some`
# * currently stored in the database
# persist_images_selector = "img"
[[channel]]
# RSS feed source
url = "https://2"
# Limit latest channel items to crawl (unlimited by default)
items_limit = 5
# Save Channel `title` and `description` in the database (currently not in use)
persist_description = true
# Save Channel item `title` and `description` in the database
persist_item_description = true
# Allowed tags
# * empty to strip all tags (default)
allowed_tags = ["a", "br", "p", "img"]
# Grab Channel item content (from the item `link`)
scrape_item_content = false
# Scrape title by CSS selector
# * None to use Channel item title if exists or fail to continue
# scrape_item_content_title_selector = "h1"
# Scrape description by CSS selector
# * None to use Channel item description if exists or fail to continue
# scrape_item_content_description_selector = "article"
# Preload content images locally if `Some`
# * currently stored in the database
# persist_images_selector = "img"

View file

@ -6,7 +6,7 @@ use std::path::PathBuf;
pub struct Argument {
/// Path to config file
///
/// * see `config/example.toml`
/// * see `config.toml`
#[arg(short, long)]
pub config: PathBuf,
}

View file

@ -0,0 +1,47 @@
use scraper::Selector;
use serde::Deserialize;
use url::Url;
#[derive(Debug, Deserialize)]
pub struct Mysql {
pub database: String,
pub host: String,
pub password: String,
pub port: u16,
pub username: String,
}
#[derive(Debug, Deserialize)]
pub struct Channel {
/// RSS feed source
pub url: Url,
/// Limit latest channel items to crawl (unlimited by default)
pub items_limit: Option<usize>,
/// Save Channel title and description in the database
pub persist_description: bool,
/// Save Channel item title and description in the database
pub persist_item_description: bool,
/// Grab Channel item content (from the item `link`)
pub scrape_item_content: bool,
/// Scrape title by CSS selector
/// * None to use Channel item title if exists or fail to continue
pub scrape_item_content_title_selector: Option<Selector>,
/// Scrape description by CSS selector
/// * None to use Channel item description if exists or fail to continue
pub scrape_item_content_description_selector: Option<Selector>,
/// Allowed tags
/// * empty to strip all tags (default)
pub allowed_tags: std::collections::HashSet<String>,
/// Preload content images locally if `Some`
/// * currently stored in the database
pub persist_images_selector: Option<Selector>,
}
#[derive(Debug, Deserialize)]
pub struct Config {
pub mysql: Mysql,
pub channel: Vec<Channel>,
/// Channels update timeout in seconds
/// * None to generate once
pub update: Option<u64>,
}

219
crates/crawler/src/main.rs Normal file
View file

@ -0,0 +1,219 @@
mod argument;
mod config;
use anyhow::{Result, bail};
use log::{debug, info, warn};
use reqwest::blocking::get;
use url::Url;
fn main() -> Result<()> {
use chrono::Local;
use clap::Parser;
use std::{env::var, fs::read_to_string};
if var("RUST_LOG").is_ok() {
use tracing_subscriber::{EnvFilter, fmt::*};
struct T;
impl time::FormatTime for T {
fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
write!(w, "{}", Local::now())
}
}
fmt()
.with_timer(T)
.with_env_filter(EnvFilter::from_default_env())
.init()
}
let argument = argument::Argument::parse();
let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
let db = mysql::Database::pool(
&config.mysql.host,
config.mysql.port,
&config.mysql.username,
&config.mysql.password,
&config.mysql.database,
)?;
info!("Crawler started");
loop {
debug!("Begin new crawl queue...");
for c in &config.channel {
debug!("Update `{}`...", c.url);
let mut tx = db.transaction()?;
match crawl(&mut tx, c) {
Ok(()) => tx.commit()?,
Err(e) => {
warn!("Channel `{}` update failed: `{e}`", c.url);
tx.rollback()?
}
}
}
debug!("Crawl queue completed");
if let Some(update) = config.update {
debug!("Wait {update} seconds to continue...",);
std::thread::sleep(std::time::Duration::from_secs(update))
} else {
return Ok(());
}
}
}
fn crawl(tx: &mut mysql::Transaction, channel_config: &config::Channel) -> Result<()> {
use std::collections::HashSet;
/// Removes all tags from `html` excluding `allowed_tags` or all if None
fn strip_tags(html: &str, allowed_tags: Option<&HashSet<String>>) -> String {
ammonia::Builder::new()
.tags(allowed_tags.map_or(HashSet::new(), |a| a.iter().map(|t| t.as_str()).collect()))
.clean(html)
.to_string()
}
let channel_url = channel_config.url.to_string(); // allocate once
let channel_id = match tx.channel_id_by_url(&channel_url)? {
Some(channel_id) => channel_id,
None => {
let channel_id = tx.insert_channel(&channel_url)?;
info!("Register new channel #{channel_id} ({channel_url})");
channel_id
}
};
let channel_items =
match rss::Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..]) {
Ok(channel) => {
if channel_config.persist_description {
let channel_description_id = tx.insert_channel_description(
channel_id,
None,
Some(strip_tags(channel.title(), None)),
Some(strip_tags(
channel.description(),
Some(&channel_config.allowed_tags),
)),
)?;
debug!("Save channel description #{channel_description_id}")
}
channel.into_items()
}
Err(e) => bail!("Could not parse response: `{e}`"),
};
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
for channel_item in channel_items.iter().take(channel_items_limit) {
let guid = match channel_item.guid {
Some(ref guid) => guid.value.as_ref(),
None => bail!("Undefined `guid` field"),
};
let (link, base) = match channel_item.link {
Some(ref link) => (link, Url::parse(link)?),
None => bail!("Undefined `link` field"),
};
let pub_date = match channel_item.pub_date {
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
Ok(t) => t.timestamp(),
Err(e) => bail!("Invalid `pub_date` field: `{e}`"),
},
None => bail!("Undefined `pub_date`"),
};
if tx.channel_items_total_by_channel_id_guid(channel_id, guid)? > 0 {
debug!("Channel item `{guid}` already exists, skipped.");
continue; // skip next steps as processed
}
let channel_item_id = tx.insert_channel_item(channel_id, pub_date, guid, link)?;
info!("Register new channel item #{channel_item_id} ({link})");
if channel_config.persist_item_description {
let channel_item_description_id = tx.insert_channel_item_description(
channel_item_id,
None,
channel_item.title().map(|s| strip_tags(s, None)),
channel_item
.description()
.map(|s| strip_tags(s, Some(&channel_config.allowed_tags))),
)?;
debug!("Save channel item description #{channel_item_description_id}")
}
// preload remote content..
if !channel_config.scrape_item_content {
continue;
}
let channel_item_content_id = tx.insert_channel_item_content(channel_item_id)?;
info!("Add new content record #{channel_item_content_id}");
let html = scraper::Html::parse_document(&get(link)?.text()?);
let description = match channel_config.scrape_item_content_description_selector {
Some(ref selector) => match html.select(selector).next() {
Some(description) => Some(strip_tags(
&description.inner_html(),
Some(&channel_config.allowed_tags),
)),
None => bail!("Could not scrape `description` selector from `{link}`"),
},
None => None,
};
let channel_item_content_description_id = tx.insert_channel_item_content_description(
channel_item_content_id,
None,
match channel_config.scrape_item_content_title_selector {
Some(ref selector) => match html.select(selector).next() {
Some(title) => Some(strip_tags(&title.inner_html(), None)),
None => bail!("Could not scrape `title` selector from `{link}`"),
},
None => None,
}
.as_ref()
.map(|s| s.trim()),
description.as_ref().map(|s| s.trim()),
)?;
debug!("Save channel item content description #{channel_item_content_description_id}");
// persist images if enabled
if let Some(ref selector) = channel_config.persist_images_selector {
use sha2::{Digest, Sha256};
if description.is_none() {
bail!("Field `description` is required to scrape images from `{link}`")
}
for element in scraper::Html::parse_document(&description.unwrap()).select(selector) {
if let Some(src) = element.value().attr("src") {
let absolute = match Url::parse(src) {
Ok(url) => url,
Err(e) => {
if e == url::ParseError::RelativeUrlWithoutBase {
let absolute = base.join(link)?;
debug!("Convert relative image link `{link}` to `{absolute}`");
absolute
} else {
bail!("Could not parse URL from img source: `{e}`")
}
}
};
let url = absolute.as_str();
let data = get(url)?.bytes()?;
let hash = format!("{:x}", Sha256::digest(&data));
let image_id = match tx.image_id_by_sha256(&hash)? {
Some(image_id) => image_id,
None => {
let image_id = tx.insert_image(&hash, Some(src), Some(url), &data)?;
info!("Persist new image #{image_id} (`{absolute}`)");
image_id
}
};
let channel_item_content_image_id =
tx.insert_channel_item_content_image(channel_item_content_id, image_id)?;
debug!("Add content image relationship #{channel_item_content_image_id}");
let uri = format!("/image/{image_id}");
tx.replace_channel_item_content_description(
channel_item_content_description_id,
src,
&uri,
)?;
debug!("Replace content image in description from `{src}` to `{uri}`")
}
}
}
}
Ok(())
}

19
crates/http/Cargo.toml Normal file
View file

@ -0,0 +1,19 @@
[package]
name = "rssto-http"
version = "0.1.0"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Web server for the rssto DB, based on Rocket engine"
keywords = ["rss", "aggregator", "http", "server"]
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
repository = "https://github.com/YGGverse/rssto"
[dependencies]
chrono = { version = "0.4.41", features = ["serde"] }
clap = { version = "4.5.54", features = ["derive"] }
mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
rocket = "0.5.1"
rocket_dyn_templates = { version = "0.2.0", features = ["tera"] }
serde = { version = "1.0.228", features = ["derive"] }
toml = "0.9.10"

21
crates/http/LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 YGGverse
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

11
crates/http/README.md Normal file
View file

@ -0,0 +1,11 @@
# rssto-http
Web server implementation based on the Rocket engine
> [!NOTE]
> In development!
```
cd rssto/crates/rssto-http
cargo run -- -c /path/to/config.toml
```

29
crates/http/config.toml Normal file
View file

@ -0,0 +1,29 @@
title = "rssto"
#description = ""
format_time = "%d/%m/%Y %H:%M"
# Provider ID (`provider` table)
# * None for the original content
# provider_id = 1
# Default listing limit
list_limit = 20
# Bind server on given host
host = "127.0.0.1"
# Bind server on given port
port = 8000
#Configure instance in the debug mode
debug = true
# Database connection setup
# * see crates/mysql/database
[mysql]
host = "localhost"
port = 3306
username = ""
password = ""
database = "rssto"

View file

@ -0,0 +1,12 @@
use clap::Parser;
use std::path::PathBuf;
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
pub struct Argument {
/// Path to config file
///
/// * see `config.toml`
#[arg(short, long)]
pub config: PathBuf,
}

24
crates/http/src/config.rs Normal file
View file

@ -0,0 +1,24 @@
use serde::Deserialize;
use std::net::IpAddr;
#[derive(Debug, Deserialize)]
pub struct Mysql {
pub database: String,
pub host: String,
pub password: String,
pub port: u16,
pub username: String,
}
#[derive(Debug, Deserialize)]
pub struct Config {
pub mysql: Mysql,
pub title: String,
pub description: Option<String>,
pub format_time: String,
pub provider_id: Option<u64>,
pub list_limit: usize,
pub host: IpAddr,
pub port: u16,
pub debug: bool,
}

58
crates/http/src/feed.rs Normal file
View file

@ -0,0 +1,58 @@
/// Export crawl index to the RSS file
pub struct Feed {
buffer: String,
}
impl Feed {
pub fn new(title: &str, description: Option<&str>, capacity: usize) -> Self {
let t = chrono::Utc::now().to_rfc2822();
let mut buffer = String::with_capacity(capacity);
buffer.push_str("<?xml version=\"1.0\" encoding=\"UTF-8\"?><rss version=\"2.0\"><channel>");
buffer.push_str(&format!("<pubDate>{t}</pubDate>"));
buffer.push_str(&format!("<lastBuildDate>{t}</lastBuildDate>"));
buffer.push_str(&format!("<title>{}</title>", escape(title)));
if let Some(d) = description {
buffer.push_str(&format!("<description>{}</description>", escape(d)));
}
Self { buffer }
}
/// Append `item` to the feed `channel`
pub fn push(
&mut self,
guid: u64,
time: chrono::DateTime<chrono::Utc>,
url: String,
title: String,
description: String,
) {
self.buffer.push_str(&format!(
"<item><guid>{guid}</guid><title>{}</title><link>{url}</link><description>{}</description><pubDate>{}</pubDate></item>",
escape(&title),
escape(&description),
time.to_rfc2822()
))
}
/// Write final bytes
pub fn commit(mut self) -> String {
self.buffer.push_str("</channel></rss>");
self.buffer
}
}
// @TODO use tera filters?
// https://keats.github.io/tera/docs/#built-in-filters
fn escape(value: &str) -> String {
value
.replace('&', "&amp;")
.replace('<', "&lt;")
.replace('>', "&gt;")
.replace('"', "&quot;")
.replace("'", "&apos;")
}

View file

@ -0,0 +1,9 @@
use rocket::serde::Serialize;
#[derive(Clone, Debug, Serialize)]
#[serde(crate = "rocket::serde")]
pub struct Global {
pub format_time: String,
pub list_limit: usize,
pub provider_id: Option<u64>,
}

309
crates/http/src/main.rs Normal file
View file

@ -0,0 +1,309 @@
#[macro_use]
extern crate rocket;
mod argument;
mod config;
mod feed;
mod global;
mod meta;
use chrono::{DateTime, Utc};
use feed::Feed;
use global::Global;
use meta::Meta;
use mysql::{Database, table::Sort};
use rocket::{
State,
http::{ContentType, Status},
response::content::RawXml,
serde::Serialize,
};
use rocket_dyn_templates::{Template, context};
#[get("/?<search>&<page>")]
fn index(
search: Option<&str>,
page: Option<usize>,
db: &State<Database>,
meta: &State<Meta>,
global: &State<Global>,
) -> Result<Template, Status> {
#[derive(Serialize)]
#[serde(crate = "rocket::serde")]
struct Row {
channel_item_content_description_id: u64,
link: String,
time: String,
title: String,
}
let mut conn = db.connection().map_err(|e| {
error!("Could not connect database: `{e}`");
Status::InternalServerError
})?;
let total = conn
.channel_item_content_descriptions_total_by_provider_id(global.provider_id, search)
.map_err(|e| {
error!("Could not get contents total: `{e}`");
Status::InternalServerError
})?;
Ok(Template::render(
"index",
context! {
title: {
let mut t = String::with_capacity(9);
if let Some(q) = search && !q.is_empty() {
t.push_str(q);
t.push_str(S);
t.push_str("Search");
t.push_str(S)
}
if let Some(p) = page && p > 1 {
t.push_str(&format!("Page {p}"));
t.push_str(S)
}
t.push_str(&meta.title);
if let Some(ref description) = meta.description
&& page.is_none_or(|p| p == 1) && search.is_none_or(|q| q.is_empty()) {
t.push_str(S);
t.push_str(description)
}
t
},
meta: meta.inner(),
back: page.map(|p| uri!(index(search, if p > 2 { Some(p - 1) } else { None }))),
next: if page.unwrap_or(1) * global.list_limit >= total { None }
else { Some(uri!(index(search, Some(page.map_or(2, |p| p + 1))))) },
rows: conn.channel_item_content_descriptions_by_provider_id(
global.provider_id,
search,
Sort::Desc,
page.map(|p| if p > 1 { p - 1 } else { 1 } * global.list_limit),
Some(global.list_limit)
).map_err(|e| {
error!("Could not get contents: `{e}`");
Status::InternalServerError
})?
.into_iter()
.map(|channel_item_content_description| {
let channel_item = conn.channel_item(
channel_item_content_description.channel_item_content_id
).unwrap().unwrap();
Row {
channel_item_content_description_id:
channel_item_content_description.channel_item_content_description_id,
link: channel_item.link,
time: time(channel_item.pub_date).format(&global.format_time).to_string(),
title: channel_item_content_description.title.unwrap_or_default(), // @TODO handle
}
})
.collect::<Vec<Row>>(),
page: page.unwrap_or(1),
pages: (total as f64 / global.list_limit as f64).ceil(),
total,
search
},
))
}
#[get("/<channel_item_content_description_id>")]
fn info(
channel_item_content_description_id: u64,
db: &State<Database>,
meta: &State<Meta>,
global: &State<Global>,
) -> Result<Template, Status> {
let mut conn = db.connection().map_err(|e| {
error!("Could not connect database: `{e}`");
Status::InternalServerError
})?;
match conn.channel_item_content_description(channel_item_content_description_id).map_err(|e| {
error!("Could not get `channel_item_content_description_id` {channel_item_content_description_id}: `{e}`");
Status::InternalServerError
})? {
Some(channel_item_content_description) => {
let channel_item_content = conn
.channel_item_content(channel_item_content_description.channel_item_content_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item_content` #{}: `{e}`",
channel_item_content_description.channel_item_content_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item_content` #{}",
channel_item_content_description.channel_item_content_id
);
Status::NotFound
})?;
let channel_item = conn
.channel_item(channel_item_content.channel_item_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item` #{}: `{e}`",
channel_item_content.channel_item_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item` #{}",
channel_item_content.channel_item_id
);
Status::NotFound
})?;
let title = channel_item_content_description.title.unwrap_or_default(); // @TODO handle
Ok(Template::render(
"info",
context! {
description: channel_item_content_description.description,
link: channel_item.link,
meta: meta.inner(),
title: format!("{title}{S}{}", meta.title),
name: title,
time: time(channel_item.pub_date).format(&global.format_time).to_string(),
},
))
}
None => Err(Status::NotFound),
}
}
#[get("/image/<image_id>")]
fn image(image_id: u64, db: &State<Database>) -> Result<(ContentType, Vec<u8>), Status> {
let mut conn = db.connection().map_err(|e| {
error!("Could not connect database: `{e}`");
Status::InternalServerError
})?;
match conn.image(image_id).map_err(|e| {
error!("Could not get content image `{image_id}`: `{e}`");
Status::InternalServerError
})? {
Some(image) => Ok((ContentType::Bytes, image.data)),
None => Err(Status::NotFound),
}
}
#[get("/rss?<search>")]
fn rss(
search: Option<&str>,
global: &State<Global>,
meta: &State<Meta>,
db: &State<Database>,
) -> Result<RawXml<String>, Status> {
let mut feed = Feed::new(
&meta.title,
meta.description.as_deref(),
1024, // @TODO
);
let mut conn = db.connection().map_err(|e| {
error!("Could not connect database: `{e}`");
Status::InternalServerError
})?;
for channel_item_content_description in conn
.channel_item_content_descriptions_by_provider_id(
global.provider_id,
search,
Sort::Desc,
None,
Some(global.list_limit),
)
.map_err(|e| {
error!(
"Could not load `channel_item_content_description` for `provider` #{:?}: `{e}`",
global.provider_id
);
Status::InternalServerError
})?
{
let channel_item_content = conn
.channel_item_content(channel_item_content_description.channel_item_content_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item_content` #{}: `{e}`",
channel_item_content_description.channel_item_content_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item_content` #{}",
channel_item_content_description.channel_item_content_id
);
Status::NotFound
})?;
let channel_item = conn
.channel_item(channel_item_content.channel_item_id)
.map_err(|e| {
error!(
"Could not get requested `channel_item` #{}: `{e}`",
channel_item_content.channel_item_id
);
Status::InternalServerError
})?
.ok_or_else(|| {
error!(
"Could not find requested `channel_item` #{}",
channel_item_content.channel_item_id
);
Status::NotFound
})?;
feed.push(
channel_item_content_description.channel_item_content_description_id,
time(channel_item.pub_date),
channel_item.link,
channel_item_content_description.title.unwrap_or_default(), // @TODO handle
channel_item_content_description
.description
.unwrap_or_default(), // @TODO handle
)
}
Ok(RawXml(feed.commit()))
}
#[launch]
fn rocket() -> _ {
use clap::Parser;
let argument = argument::Argument::parse();
let config: config::Config =
toml::from_str(&std::fs::read_to_string(argument.config).unwrap()).unwrap();
rocket::build()
.attach(Template::fairing())
.configure(rocket::Config {
port: config.port,
address: config.host,
..if config.debug {
rocket::Config::debug_default()
} else {
rocket::Config::release_default()
}
})
.manage(
Database::pool(
&config.mysql.host,
config.mysql.port,
&config.mysql.username,
&config.mysql.password,
&config.mysql.database,
)
.unwrap(),
)
.manage(Global {
format_time: config.format_time,
list_limit: config.list_limit,
provider_id: config.provider_id,
})
.manage(Meta {
description: config.description,
title: config.title,
version: env!("CARGO_PKG_VERSION").into(),
})
.mount("/", routes![index, rss, info, image])
}
const S: &str = "";
fn time(timestamp: i64) -> DateTime<Utc> {
DateTime::<Utc>::from_timestamp(timestamp, 0).unwrap()
}

9
crates/http/src/meta.rs Normal file
View file

@ -0,0 +1,9 @@
use rocket::serde::Serialize;
#[derive(Clone, Debug, Serialize)]
#[serde(crate = "rocket::serde")]
pub struct Meta {
pub description: Option<String>,
pub title: String,
pub version: String,
}

View file

@ -0,0 +1,21 @@
{% extends "layout" %}
{% block content %}
{% if rows %}
{% for row in rows %}
<div>
<a name="{{ row.channel_item_content_description_id }}"></a>
<h2><a href="{{ row.channel_item_content_description_id }}">{{ row.title }}</a></h2>
<p>{{ row.time }}</p>
</div>
{% endfor %}
{% else %}
<div>
<p>Nothing.</p>
</div>
{% endif %}
{% if next %}<a href="{{ next }}">Next</a>{% endif %}
{% if back %}<a href="{{ back }}">Back</a>{% endif %}
{% if total %}
<p>Page {{ page }} / {{ pages }} ({{ total }} total)</p>
{% endif %}
{% endblock content %}

View file

@ -0,0 +1,10 @@
{% extends "layout" %}
{% block content %}
<div>
<h1>{{ name }}</h1>
<p><a href="{{ link }}">{{ time }}</a></p>
<div>
{{ description | safe }}
</div>
</div>
{% endblock content %}

View file

@ -0,0 +1,25 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8" />
<title>{{ title }}</title>
{% if meta.description %}
<meta name="description" content="{{ meta.description }}" />
{% endif %}
<style>
* {color-scheme: light dark}
</style>
</head>
<body>
<header>
<h1><a href="/">{{ meta.title }}</a></h1>
<form action="/" method="GET">
<input type="text" name="search" value="{% if search %}{{ search }}{% endif %}" placeholder="Keyword..." />
<input type="submit" value="Search" />
</form>
</header>
<main>
{% block content %}{% endblock content %}
</main>
</body>
</html>

22
crates/llm/Cargo.toml Normal file
View file

@ -0,0 +1,22 @@
[package]
name = "rssto-llm"
version = "0.1.0"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "LLM daemon for the rssto DB translations"
keywords = ["rss", "llm", "translation", "localization", "server"]
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
repository = "https://github.com/YGGverse/rssto"
[dependencies]
anyhow = "1.0.100"
chrono = "0.4.42"
clap = { version = "4.5.54", features = ["derive"] }
lancor = "0.1.1"
log = "0.4.29"
mysql = { package = "rssto-mysql", version = "0.1.0", features = ["transaction"], path = "../mysql" }
serde = { version = "1.0.228", features = ["derive"] }
tokio = { version = "1.0", features = ["full"] }
toml = "0.9.10"
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }

21
crates/llm/LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 YGGverse
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

22
crates/llm/README.md Normal file
View file

@ -0,0 +1,22 @@
# rssto-llm
LLM daemon for the rssto DB translations
> [!NOTE]
> In development!
1. Setup `rssto-crawler` first and collect initial data
2. Run LLM server:
```
llama-server -hf ggml-org/gemma-3-1b-it-GGUF
```
3. Launch `rssto-llm` to handle `content` DB:
```
cd rssto/crates/rssto-llm
cargo run -- -c /path/to/config.toml
```
* see `--help` to display all supported options

22
crates/llm/config.toml Normal file
View file

@ -0,0 +1,22 @@
# Rescan database for new subjects, in seconds
# * process once if not defined
# update = 900
# Database connection setup
# * see crates/mysql/database
[mysql]
host = "localhost"
port = 3306
username = ""
password = ""
database = "rssto"
# LLM connection setup
[llm]
scheme = "http"
host = "127.0.0.1"
port = 8080
# Model name
model = "ggml-org/gemma-3-1b-it-GGUF"
# Initial message for the `content` subject (e.g. `translate to...`)
message = "translate to english:"

View file

@ -0,0 +1,12 @@
use clap::Parser;
use std::path::PathBuf;
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
pub struct Argument {
/// Path to config file
///
/// * see `config.toml`
#[arg(short, long)]
pub config: PathBuf,
}

27
crates/llm/src/config.rs Normal file
View file

@ -0,0 +1,27 @@
use serde::Deserialize;
use std::net::IpAddr;
#[derive(Debug, Deserialize)]
pub struct Mysql {
pub database: String,
pub host: IpAddr,
pub password: String,
pub port: u16,
pub username: String,
}
#[derive(Debug, Deserialize)]
pub struct Llm {
pub scheme: String,
pub host: IpAddr,
pub port: u16,
pub model: String,
pub message: String,
}
#[derive(Debug, Deserialize)]
pub struct Config {
pub mysql: Mysql,
pub llm: Llm,
pub update: Option<u64>,
}

124
crates/llm/src/main.rs Normal file
View file

@ -0,0 +1,124 @@
mod argument;
mod config;
use anyhow::Result;
use mysql::Database;
#[tokio::main]
async fn main() -> Result<()> {
use chrono::Local;
use clap::Parser;
use lancor::{ChatCompletionRequest, LlamaCppClient, Message};
use log::{debug, info};
use std::env::var;
if var("RUST_LOG").is_ok() {
use tracing_subscriber::{EnvFilter, fmt::*};
struct T;
impl time::FormatTime for T {
fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
write!(w, "{}", Local::now())
}
}
fmt()
.with_timer(T)
.with_env_filter(EnvFilter::from_default_env())
.init()
}
let argument = argument::Argument::parse();
let config: config::Config = toml::from_str(&std::fs::read_to_string(argument.config)?)?;
let llm = LlamaCppClient::new(format!(
"{}://{}:{}",
config.llm.scheme, config.llm.host, config.llm.port
))?;
let db = Database::pool(
&config.mysql.host.to_string(),
config.mysql.port,
&config.mysql.username,
&config.mysql.password,
&config.mysql.database,
)?;
let provider_id = {
let mut conn = db.connection()?;
match conn.provider_id_by_name(&config.llm.model)? {
Some(provider_id) => {
debug!(
"Use existing DB provider #{} matches model name `{}`",
provider_id, &config.llm.model
);
provider_id
}
None => {
let provider_id = conn.insert_provider(&config.llm.model)?;
info!(
"Provider `{}` not found in database, created new one with ID `{provider_id}`",
&config.llm.model
);
provider_id
}
}
};
info!("Daemon started");
loop {
debug!("New queue begin...");
let mut tx = db.transaction()?;
for channel_item_content_description in
tx.channel_item_content_descriptions_queue_for_provider_id(provider_id)?
{
debug!(
"Begin generating `channel_item_content_description` #{} using `provider_id` #{provider_id}.",
channel_item_content_description.channel_item_content_description_id
);
let title = match channel_item_content_description.title {
Some(subject) => Some(
llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, subject)),
))
.await?
.choices[0]
.message
.content
.trim()
.to_string(),
),
None => None,
};
let description = match channel_item_content_description.description {
Some(subject) => Some(
llm.chat_completion(ChatCompletionRequest::new(&config.llm.model).message(
Message::user(format!("{}\n{}", config.llm.message, subject)),
))
.await?
.choices[0]
.message
.content
.trim()
.to_string(),
),
None => None,
};
let channel_item_content_description_id = tx.insert_channel_item_content_description(
channel_item_content_description.channel_item_content_id,
Some(provider_id),
title.as_deref(),
description.as_deref(),
)?;
info!(
"Create `channel_item_content_description` #{channel_item_content_description_id} by `provider_id` #{provider_id}."
);
}
tx.commit()?;
debug!("Queue completed");
if let Some(update) = config.update {
debug!("Wait {update} seconds to continue...");
std::thread::sleep(std::time::Duration::from_secs(update))
} else {
return Ok(());
}
}
}

17
crates/mysql/Cargo.toml Normal file
View file

@ -0,0 +1,17 @@
[package]
name = "rssto-mysql"
version = "0.1.0"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Shared MySQL database library"
keywords = ["rssto", "database", "mysql", "library", "api"]
# categories = []
repository = "https://github.com/YGGverse/rssto"
[features]
default = []
transaction = []
[dependencies]
mysql = "26.0.1"

21
crates/mysql/LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 YGGverse
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

6
crates/mysql/README.md Normal file
View file

@ -0,0 +1,6 @@
# rssto-mysql
Shared MySQL database library
> [!TIP]
> See `database.mwb` model or `version` directory to deploy

BIN
crates/mysql/database.mwb Normal file

Binary file not shown.

View file

@ -0,0 +1,143 @@
use crate::table::*;
use mysql::{Error, Pool, PooledConn, prelude::Queryable};
/// Safe, read-only operations used in client apps like `rssto-http`
pub struct Connection {
conn: PooledConn,
}
impl Connection {
pub fn create(pool: &Pool) -> Result<Self, Error> {
Ok(Self {
conn: pool.get_conn()?,
})
}
pub fn channel_item(&mut self, channel_item_id: u64) -> Result<Option<ChannelItem>, Error> {
self.conn.exec_first(
"SELECT `channel_item_id`,
`channel_id`,
`pub_date`,
`guid`,
`link` FROM `channel_item` WHERE `channel_item_id` = ?",
(channel_item_id,),
)
}
pub fn channel_item_content(
&mut self,
channel_item_content_id: u64,
) -> Result<Option<ChannelItemContent>, Error> {
self.conn.exec_first(
"SELECT `channel_item_content_id`,
`channel_item_id`
FROM `channel_item_content` WHERE `channel_item_content_id` = ?",
(channel_item_content_id,),
)
}
pub fn channel_item_content_description(
&mut self,
channel_item_content_description_id: u64,
) -> Result<Option<ChannelItemContentDescription>, Error> {
self.conn.exec_first(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description` FROM `channel_item_content_description`
WHERE `channel_item_content_description_id` = ?",
(channel_item_content_description_id,),
)
}
pub fn channel_item_content_descriptions_total_by_provider_id(
&mut self,
provider_id: Option<u64>,
keyword: Option<&str>,
) -> Result<usize, Error> {
let total: Option<usize> = match keyword {
Some(k) => self.conn.exec_first(
"SELECT COUNT(*) FROM `channel_item_content_description`
WHERE `provider_id` <=> ? AND `title` LIKE '%?%'",
(provider_id, k),
)?,
None => self.conn.exec_first(
"SELECT COUNT(*) FROM `channel_item_content_description`
WHERE `provider_id` <=> ?",
(provider_id,),
)?,
};
Ok(total.unwrap_or(0))
}
pub fn channel_item_content_descriptions_by_provider_id(
&mut self,
provider_id: Option<u64>,
keyword: Option<&str>,
sort: Sort,
start: Option<usize>,
limit: Option<usize>,
) -> Result<Vec<ChannelItemContentDescription>, Error> {
match keyword {
Some(k) => self.conn.exec(
format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ? AND `title` LIKE '%?%'
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id, k),
),
None => self.conn.exec(
format!(
"SELECT `channel_item_content_description_id`,
`channel_item_content_id`,
`provider_id`,
`title`,
`description`
FROM `channel_item_content_description`
WHERE `provider_id` <=> ?
ORDER BY `channel_item_content_description_id` {sort} LIMIT {},{}",
start.unwrap_or(0),
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id,),
),
}
}
pub fn image(&mut self, image_id: u64) -> Result<Option<Image>, Error> {
self.conn.exec_first(
"SELECT `image_id`,
`provider_id`,
`sha256`,
`src`,
`url`,
`data` FROM `image` WHERE `image_id` = ?",
(image_id,),
)
}
pub fn provider_id_by_name(&mut self, name: &str) -> Result<Option<u64>, Error> {
self.conn.exec_first(
"SELECT `provider_id` FROM `provider` WHERE `name` = ?",
(name,),
)
}
pub fn insert_provider(&mut self, name: &str) -> Result<u64, Error> {
self.conn
.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?;
Ok(self.conn.last_insert_id())
}
}
const DEFAULT_LIMIT: usize = 100;

36
crates/mysql/src/lib.rs Normal file
View file

@ -0,0 +1,36 @@
mod connection;
pub mod table;
#[cfg(feature = "transaction")]
mod transaction;
pub use connection::Connection;
#[cfg(feature = "transaction")]
pub use transaction::Transaction;
pub struct Database {
pool: mysql::Pool,
}
impl Database {
pub fn pool(
host: &str,
port: u16,
user: &str,
password: &str,
database: &str,
) -> Result<Self, mysql::Error> {
Ok(Self {
pool: mysql::Pool::new(
format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(),
)?,
})
}
pub fn connection(&self) -> Result<Connection, mysql::Error> {
Connection::create(&self.pool)
}
#[cfg(feature = "transaction")]
pub fn transaction(&self) -> Result<Transaction, mysql::Error> {
Transaction::create(&self.pool)
}
}

74
crates/mysql/src/table.rs Normal file
View file

@ -0,0 +1,74 @@
use mysql::prelude::FromRow;
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Channel {
pub channel_id: u64,
pub url: String,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItem {
pub channel_item_id: u64,
pub channel_id: u64,
pub pub_date: i64,
pub guid: String,
pub link: String,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemDescription {
pub channel_item_description_id: u64,
pub channel_item_id: u64,
pub provider_id: Option<u64>,
pub title: Option<String>,
pub description: Option<String>,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemContent {
pub channel_item_content_id: u64,
pub channel_item_id: u64,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItemContentDescription {
pub channel_item_content_description_id: u64,
pub channel_item_content_id: u64,
pub provider_id: Option<u64>,
pub title: Option<String>,
pub description: Option<String>,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Provider {
pub provider_id: u64,
pub name: String,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Image {
pub image_id: u64,
pub provider_id: Option<u64>,
/// Keep image unique by comparing its data hash
pub sha256: String,
/// Original `src` tag value to post-replacing
pub src: Option<String>,
/// Resolved absolute URL
pub url: Option<String>,
/// Image data, MEDIUMBLOB (16M)
pub data: Vec<u8>,
}
pub enum Sort {
Asc,
Desc,
}
impl std::fmt::Display for Sort {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::Asc => write!(f, "ASC"),
Self::Desc => write!(f, "DESC"),
}
}
}

View file

@ -0,0 +1,195 @@
use crate::table::*;
use mysql::{Error, Pool, TxOpts, prelude::Queryable};
/// Safe, optimized read/write operations
/// mostly required by the `rssto-crawler` and `rssto-llm`
/// * all members implementation requires `commit` action
pub struct Transaction {
tx: mysql::Transaction<'static>,
}
impl Transaction {
pub fn create(pool: &Pool) -> Result<Self, Error> {
Ok(Self {
tx: pool.start_transaction(TxOpts::default())?,
})
}
pub fn commit(self) -> Result<(), Error> {
self.tx.commit()
}
pub fn rollback(self) -> Result<(), Error> {
self.tx.rollback()
}
pub fn channel_id_by_url(&mut self, url: &str) -> Result<Option<u64>, Error> {
self.tx.exec_first(
"SELECT `channel_id` FROM `channel` WHERE `url` = ? LIMIT 1",
(url,),
)
}
pub fn insert_channel(&mut self, url: &str) -> Result<u64, Error> {
self.tx
.exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn insert_channel_description(
&mut self,
channel_id: u64,
provider_id: Option<u64>,
title: Option<String>,
description: Option<String>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_description` SET `channel_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
(channel_id, provider_id, title, description),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn channel_items_total_by_channel_id_guid(
&mut self,
channel_id: u64,
guid: &str,
) -> Result<usize, Error> {
Ok(self
.tx
.exec_first(
"SELECT COUNT(*) FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ?",
(channel_id, guid),
)?
.unwrap_or(0))
}
pub fn insert_channel_item(
&mut self,
channel_id: u64,
pub_date: i64,
guid: &str,
link: &str,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item` SET `channel_id` = ?,
`pub_date` = ?,
`guid` = ?,
`link` = ?",
(channel_id, pub_date, guid, link),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn insert_channel_item_description(
&mut self,
channel_item_id: u64,
provider_id: Option<u64>,
title: Option<String>,
description: Option<String>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item_description` SET `channel_item_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
(channel_item_id, provider_id, title, description),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn channel_item_content_descriptions_queue_for_provider_id(
&mut self,
provider_id: u64,
) -> Result<Vec<ChannelItemContentDescription>, Error> {
self.tx.exec(
"SELECT `t1`.`channel_item_content_description_id`,
`t1`.`channel_item_content_id`,
`t1`.`provider_id`,
`t1`.`title`,
`t1`.`description`
FROM `channel_item_content_description` AS `t1`
WHERE `t1`.`provider_id` IS NULL AND NOT EXISTS (
SELECT NULL FROM `channel_item_content_description` AS `t2`
WHERE `t2`.`channel_item_content_description_id` = `t1`.`channel_item_content_description_id`
AND `t2`.`provider_id` = ? LIMIT 1
)",
(provider_id,),
)
} // @TODO upgrade to the latest version
pub fn insert_channel_item_content(&mut self, channel_item_id: u64) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item_content` SET `channel_item_id` = ?",
(channel_item_id,),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn insert_channel_item_content_description(
&mut self,
channel_item_content_id: u64,
provider_id: Option<u64>,
title: Option<&str>,
description: Option<&str>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item_content_description` SET `channel_item_content_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
(channel_item_content_id, provider_id, title, description),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn replace_channel_item_content_description(
&mut self,
channel_item_content_description_id: u64,
from: &str,
to: &str,
) -> Result<(), Error> {
self.tx.exec_drop(
"UPDATE `channel_item_content_description`
SET `description` = REPLACE(`description`, ?, ?)
WHERE `channel_item_content_description_id` = ?",
(from, to, channel_item_content_description_id),
)
}
pub fn insert_channel_item_content_image(
&mut self,
channel_item_content_id: u64,
image_id: u64,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item_content_image` SET `channel_item_content_id` = ?, `image_id` = ?",
(channel_item_content_id, image_id),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn image_id_by_sha256(&mut self, sha256: &str) -> Result<Option<u64>, Error> {
self.tx.exec_first(
"SELECT `image_id` FROM `image` WHERE `sha256` = ? LIMIT 1",
(sha256,),
)
}
pub fn insert_image(
&mut self,
sha256: &str,
src: Option<&str>,
url: Option<&str>,
data: &[u8],
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `image` SET `sha256` = ?, `src` = ?, `url` = ?, `data` = ?",
(sha256, src, url, data),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
}

View file

@ -0,0 +1,202 @@
-- MySQL Script generated by MySQL Workbench
-- нд, 11-січ-2026 21:01:10 +0200
-- Model: New Model Version: 1.0
-- MySQL Workbench Forward Engineering
SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION';
-- -----------------------------------------------------
-- Schema rssto
-- -----------------------------------------------------
-- -----------------------------------------------------
-- Schema rssto
-- -----------------------------------------------------
CREATE SCHEMA IF NOT EXISTS `rssto` ;
USE `rssto` ;
-- -----------------------------------------------------
-- Table `rssto`.`channel`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel` (
`channel_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`url` VARCHAR(255) NOT NULL,
PRIMARY KEY (`channel_id`),
UNIQUE INDEX `url_UNIQUE` (`url` ASC) VISIBLE)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item` (
`channel_item_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL,
`pub_date` BIGINT NOT NULL,
`guid` VARCHAR(255) NOT NULL,
`link` VARCHAR(255) NOT NULL,
PRIMARY KEY (`channel_item_id`, `channel_id`),
INDEX `fk_channel_item_channel_idx` (`channel_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`guid` ASC, `channel_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_channel`
FOREIGN KEY (`channel_id`)
REFERENCES `rssto`.`channel` (`channel_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`provider`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`provider` (
`provider_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`name` VARCHAR(255) NOT NULL,
PRIMARY KEY (`provider_id`),
UNIQUE INDEX `name_UNIQUE` (`name` ASC) VISIBLE)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_content`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content` (
`channel_item_content_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT UNSIGNED NOT NULL,
PRIMARY KEY (`channel_item_content_id`, `channel_item_id`),
INDEX `fk_channel_item_content_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_channel_item`
FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`image`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`image` (
`image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`provider_id` INT UNSIGNED NULL,
`sha256` CHAR(64) NOT NULL,
`src` VARCHAR(2048) NULL,
`url` VARCHAR(2048) NULL,
`data` MEDIUMBLOB NOT NULL,
PRIMARY KEY (`image_id`),
UNIQUE INDEX `hash_UNIQUE` (`sha256` ASC) VISIBLE,
INDEX `fk_image_provider_idx` (`provider_id` ASC) VISIBLE,
CONSTRAINT `fk_image_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_content_image`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_image` (
`channel_item_content_image_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_content_id` BIGINT UNSIGNED NOT NULL,
`image_id` BIGINT UNSIGNED NOT NULL,
PRIMARY KEY (`channel_item_content_image_id`),
INDEX `fk_channel_item_content_image_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE,
INDEX `fk_channel_item_content_image_image_idx` (`image_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_image_channel_item_content`
FOREIGN KEY (`channel_item_content_id`)
REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_content_image_image`
FOREIGN KEY (`image_id`)
REFERENCES `rssto`.`image` (`image_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_description`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_description` (
`channel_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_id` INT UNSIGNED NOT NULL,
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_description_id`),
INDEX `fk_channel_description_provider_idx` (`provider_id` ASC) VISIBLE,
INDEX `fk_channel_description_channel_idx` (`channel_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`channel_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_description_channel`
FOREIGN KEY (`channel_id`)
REFERENCES `rssto`.`channel` (`channel_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_description`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_description` (
`channel_item_description_id` INT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_id` INT UNSIGNED NOT NULL,
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
INDEX `fk_channel_item_description_channel_item_idx` (`channel_item_id` ASC) VISIBLE,
INDEX `fk_channel_item_description_provider_idx` (`provider_id` ASC) VISIBLE,
PRIMARY KEY (`channel_item_description_id`),
UNIQUE INDEX `UNIQUE` (`channel_item_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_description_channel_item`
FOREIGN KEY (`channel_item_id`)
REFERENCES `rssto`.`channel_item` (`channel_item_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `rssto`.`channel_item_content_description`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `rssto`.`channel_item_content_description` (
`channel_item_content_description_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`channel_item_content_id` BIGINT UNSIGNED NOT NULL,
`provider_id` INT UNSIGNED NULL,
`title` TEXT NULL,
`description` LONGTEXT NULL,
PRIMARY KEY (`channel_item_content_description_id`),
INDEX `fk_channel_item_content_description_channel_item_content_idx` (`channel_item_content_id` ASC) VISIBLE,
INDEX `fk_channel_item_content_description_provider_idx` (`provider_id` ASC) VISIBLE,
UNIQUE INDEX `UNIQUE` (`channel_item_content_id` ASC, `provider_id` ASC) VISIBLE,
CONSTRAINT `fk_channel_item_content_description_channel_item_content`
FOREIGN KEY (`channel_item_content_id`)
REFERENCES `rssto`.`channel_item_content` (`channel_item_content_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION,
CONSTRAINT `fk_channel_item_content_description_provider`
FOREIGN KEY (`provider_id`)
REFERENCES `rssto`.`provider` (`provider_id`)
ON DELETE NO ACTION
ON UPDATE NO ACTION)
ENGINE = InnoDB;
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;

View file

@ -1,32 +0,0 @@
use serde::Deserialize;
use std::path::PathBuf;
use url::Url;
#[derive(Debug, Deserialize)]
pub struct Feed {
/// RSS feed source
pub url: Url,
/// Destination directory
pub storage: PathBuf,
/// Path to templates (export formats)
pub templates: Vec<PathBuf>,
/// Limit channel items (unlimited by default)
pub list_items_limit: Option<usize>,
pub pub_date_format: String,
pub last_build_date_format: String,
pub time_generated_format: String,
}
#[derive(Debug, Deserialize)]
pub struct Config {
pub feed: Vec<Feed>,
/// Update timeout in seconds
///
/// * None to generate once
pub update: Option<u64>,
}

View file

@ -1,148 +0,0 @@
mod argument;
mod config;
use anyhow::Result;
use argument::Argument;
use chrono::{DateTime, Local};
use clap::Parser;
use config::{Config, Feed};
use log::{debug, info, warn};
use std::{
env::var,
fs::{File, create_dir_all, read_to_string},
io::Write,
path::PathBuf,
};
use strip_tags::*;
fn main() -> Result<()> {
if var("RUST_LOG").is_ok() {
use tracing_subscriber::{EnvFilter, fmt::*};
struct T;
impl time::FormatTime for T {
fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
write!(w, "{}", Local::now())
}
}
fmt()
.with_timer(T)
.with_env_filter(EnvFilter::from_default_env())
.init()
}
let argument = Argument::parse();
let config: Config = toml::from_str(&read_to_string(argument.config)?)?;
info!("Crawler started");
loop {
debug!("Begin new crawl queue...");
for feed in &config.feed {
debug!("Update `{}`...", feed.url);
if let Err(e) = crawl(feed) {
warn!("Feed `{}` update failed: `{e}`", feed.url)
}
}
debug!("Crawl queue completed");
if let Some(update) = config.update {
debug!("Wait {update} seconds to continue...",);
std::thread::sleep(std::time::Duration::from_secs(update))
} else {
return Ok(());
}
}
}
fn crawl(feed: &Feed) -> Result<()> {
use reqwest::blocking::get;
use rss::Channel;
let channel = Channel::read_from(&get(feed.url.as_str())?.bytes()?[..])?;
let channel_items = channel.items();
let channel_items_limit = feed.list_items_limit.unwrap_or(channel_items.len());
let regex = regex::Regex::new(r"\n{3,}").unwrap();
for template in &feed.templates {
let root = PathBuf::from(template);
let extension = root.file_name().unwrap().to_string_lossy();
let index = {
let mut p = PathBuf::from(&root);
p.push(format!("index.{extension}"));
read_to_string(p)?
};
let index_item = {
let mut p = PathBuf::from(&root);
p.push("index");
p.push(format!("item.{extension}"));
read_to_string(p)?
};
create_dir_all(&feed.storage)?;
File::create({
let mut p = PathBuf::from(&feed.storage);
p.push(format!("index.{extension}"));
p
})?
.write_all(
index
.replace("{title}", &strip_tags(channel.title()))
.replace("{description}", &strip_tags(channel.description()))
.replace("{link}", channel.link())
.replace("{language}", channel.language().unwrap_or_default())
.replace(
"{pub_date}",
&time(channel.pub_date(), &feed.pub_date_format),
)
.replace(
"{last_build_date}",
&time(channel.last_build_date(), &feed.last_build_date_format),
)
.replace("{time_generated}", &time(None, &feed.time_generated_format))
.replace(
"{items}",
&channel_items
.iter()
.take(channel_items_limit)
.map(|i| {
regex
.replace_all(
&index_item
.replace(
"{title}",
&strip_tags(i.title().unwrap_or_default()),
)
.replace(
"{description}",
&strip_tags(i.description().unwrap_or_default()),
)
.replace("{link}", i.link().unwrap_or_default())
.replace(
"{pub_date}",
&time(i.pub_date(), &feed.pub_date_format),
),
"\n\n",
)
.to_string()
})
.collect::<String>(),
)
.as_bytes(),
)?
}
Ok(())
}
fn time(value: Option<&str>, format: &str) -> String {
match value {
Some(v) => DateTime::parse_from_rfc2822(v).unwrap(),
None => Local::now().into(),
}
.format(format)
.to_string()
}

View file

@ -1,7 +0,0 @@
# {title}
{description}
## {time_generated}
{items}

View file

@ -1,6 +0,0 @@
### {title}
{description}
=> {link} {pub_date}

View file

@ -1,49 +0,0 @@
<!DOCTYPE html>
<html lang="{language}">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title}</title>
<style>
* {
color-scheme: light dark
}
body {
margin: 0 auto;
max-width: 1024px
}
header {
border-bottom: 1px #ccc dotted;
padding-bottom: 32px;
}
section > article {
border-bottom: 1px #ccc dotted;
padding-bottom: 32px;
}
footer {
font-size: small;
padding: 16px 0;
}
</style>
</head>
<body>
<header>
<h1>{title}</h1>
{description}
</header>
<section>
{items}
</section>
<footer>
<p>
Source: <a href="{link}">{title}</a> |
Updated: {pub_date} |
Build: {last_build_date} |
Generated: {time_generated}
</p>
<p>
Powered by <a href="https://github.com/YGGverse/rssto">rssto</a>.
</p>
</footer>
</body>
</html>

View file

@ -1,5 +0,0 @@
<article>
<h2>{title}</h2>
<p>{description}</p>
<a href="{link}">{pub_date}</a>
</article>