initial commit

This commit is contained in:
yggverse 2026-01-07 15:00:55 +02:00
parent 8dfc595961
commit e070db316c
19 changed files with 400 additions and 356 deletions

1
.gitignore vendored
View file

@ -1,3 +1,2 @@
/public
/target
Cargo.lock

View file

@ -1,24 +1,6 @@
[package]
name = "rssto"
version = "0.2.2"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Convert RSS feeds into multiple formats"
keywords = ["rss", "aggregator", "conversion", "html", "gemtext"]
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
repository = "https://github.com/YGGverse/rssto"
[dependencies]
anyhow = "1.0"
chrono = "^0.4.20"
clap = { version = "4.5", features = ["derive"] }
log = "0.4"
regex = "1.12"
reqwest = { version = "0.12", features = ["blocking"] }
rss = "2.0"
serde = { version = "1.0", features = ["derive"] }
strip-tags = "0.1"
toml = "0.9"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
url = "2.5"
[workspace]
resolver = "2"
members = [
"crates/crawler",
"crates/mysql",
]

View file

@ -6,68 +6,5 @@
Convert RSS feeds into multiple formats
## Features
* [x] Multiple feed sources with flexible TOML config options
* [x] Limit channel items
* [x] Format time
* [x] Multiple export format definition
* [x] Custom templates
* [x] Single export or daemon mode with update time
* [x] Export formats:
* [x] HTML
* [x] [Gemtext](https://geminiprotocol.net/docs/gemtext.gmi)
## Install
``` bash
cargo install rssto
```
## Launch
``` bash
rssto -c config/example.toml
```
> [!TIP]
> * prepend `RUST_LOG=DEBUG` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html))
> * append `-u TIME` to run as the daemon with `TIME` interval update
> * see `rssto --help` to print all available options
### Systemd
1. Install `rssto` by copying the compiled binary into the system application directory:
* Linux: `sudo install /home/user/.cargo/bin/rssto /usr/local/bin/rssto`
2. Create `systemd` configuration file at `/etc/systemd/system/rssto.service`:
``` rssto.service
[Unit]
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=rssto
Group=rssto
# Uncomment for debug
# Environment="RUST_LOG=DEBUG"
# Environment="NO_COLOR=1"
ExecStart=/usr/local/bin/rssto -c /path/to/config.toml
StandardOutput=file:/home/rssto/debug.log
StandardError=file:/home/rssto/error.log
[Install]
WantedBy=multi-user.target
```
* example above requires new system user (`useradd -m rssto`)
3. Run in priority:
* `systemctl daemon-reload` - reload systemd configuration
* `systemctl enable rssto` - enable new service
* `systemctl start rssto` - start the process
* `systemctl status rssto` - check process launched
> [!NOTE]
> Branch in development!

View file

@ -1,19 +0,0 @@
update = 60
[[feed]]
url = "https://assets.censor.net/rss/censor.net/rss_uk_news.xml"
storage = "./public/censor.net/rss_uk_news"
templates = ["./template/html","./template/gmi"]
list_items_limit = 20
pub_date_format = "%Y/%m/%d %H:%M:%S %z"
last_build_date_format = "%Y/%m/%d %H:%M:%S %z"
time_generated_format = "%Y/%m/%d %H:%M:%S %z"
[[feed]]
url = "https://assets.censor.net/rss/censor.net/rss_uk_resonance.xml"
storage = "./public/censor.net/rss_uk_resonance"
templates = ["./template/html","./template/gmi"]
list_items_limit = 20
pub_date_format = "%Y/%m/%d %H:%M:%S %z"
last_build_date_format = "%Y/%m/%d %H:%M:%S %z"
time_generated_format = "%Y/%m/%d %H:%M:%S %z"

23
crates/crawler/Cargo.toml Normal file
View file

@ -0,0 +1,23 @@
[package]
name = "rssto-crawler"
version = "0.1.0"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Crawl RSS feeds into MySQL database"
keywords = ["rss", "aggregator", "conversion", "mysql", "crawler"]
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
repository = "https://github.com/YGGverse/rssto"
[dependencies]
anyhow = "1.0.100"
chrono = "0.4.42"
clap = { version = "4.5.54", features = ["derive"] }
log = "0.4.29"
mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
reqwest = { version = "0.13.1", features = ["blocking"] }
rss = "2.0.12"
serde = { version = "1.0.228", features = ["derive"] }
toml = "0.9.10"
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
url = { version = "2.5.8", features = ["serde"] }

21
crates/crawler/LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 YGGverse
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View file

@ -0,0 +1,20 @@
update = 60
[mysql]
host = "localhost"
port = 3307
user = ""
password = ""
database = "rssto"
[[channel]]
url = "https://"
items_limit = 20
persist_item_title = true
persist_item_description = true
[[channel]]
url = "https://"
items_limit = 20
persist_item_title = true
persist_item_description = true

View file

@ -0,0 +1,33 @@
use serde::Deserialize;
use url::Url;
/// MySQL connection settings, deserialized from the `[mysql]` table
/// of the crawler's TOML configuration file.
#[derive(Debug, Deserialize)]
pub struct Mysql {
    /// Database (schema) name
    pub database: String,
    /// Server hostname or IP address
    pub host: String,
    /// Account password
    pub password: String,
    /// Server TCP port
    pub port: u16,
    /// Account user name
    pub user: String,
}
/// Per-feed crawl settings, deserialized from each `[[channel]]`
/// table of the crawler's TOML configuration file.
#[derive(Debug, Deserialize)]
pub struct Channel {
    /// RSS feed source
    pub url: Url,
    /// Limit channel items (unlimited by default)
    pub items_limit: Option<usize>,
    /// Save item title
    pub persist_item_title: bool,
    /// Save item description
    pub persist_item_description: bool,
}
/// Root crawler configuration, parsed from the TOML file given on the
/// command line.
#[derive(Debug, Deserialize)]
pub struct Config {
    /// MySQL connection settings (`[mysql]` table)
    pub mysql: Mysql,
    /// Feeds to crawl (`[[channel]]` tables)
    pub channel: Vec<Channel>,
    /// Update timeout in seconds
    ///
    /// * None to generate once
    pub update: Option<u64>,
}

124
crates/crawler/src/main.rs Normal file
View file

@ -0,0 +1,124 @@
mod argument;
mod config;
use anyhow::Result;
use log::{debug, info, warn};
use mysql::Mysql;
/// Crawler entry point: parse CLI arguments, load the TOML config,
/// connect to MySQL, then crawl every configured channel — once, or in
/// a loop when `update` (seconds) is set in the config.
fn main() -> Result<()> {
    use argument::Argument;
    use chrono::Local;
    use clap::Parser;
    use std::{env::var, fs::read_to_string};
    // Initialize tracing output only when RUST_LOG is set; the level
    // filter is taken from the environment via EnvFilter.
    if var("RUST_LOG").is_ok() {
        use tracing_subscriber::{EnvFilter, fmt::*};
        // Custom timer rendering log timestamps in local time.
        struct T;
        impl time::FormatTime for T {
            fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
                write!(w, "{}", Local::now())
            }
        }
        fmt()
            .with_timer(T)
            .with_env_filter(EnvFilter::from_default_env())
            .init()
    }
    let argument = Argument::parse();
    let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
    // One shared connection, reused for every feed and every pass.
    let mut database = Mysql::connect(
        &config.mysql.host,
        config.mysql.port,
        &config.mysql.user,
        &config.mysql.password,
        &config.mysql.database,
    )?;
    info!("Crawler started");
    loop {
        debug!("Begin new crawl queue...");
        for feed in &config.channel {
            debug!("Update `{}`...", feed.url);
            // A failing feed is logged and skipped; it must not abort the queue.
            if let Err(e) = crawl(&mut database, feed) {
                warn!("Feed `{}` update failed: `{e}`", feed.url)
            }
        }
        debug!("Crawl queue completed");
        if let Some(update) = config.update {
            debug!("Wait {update} seconds to continue...",);
            std::thread::sleep(std::time::Duration::from_secs(update))
        } else {
            // No update interval configured: run a single pass and exit.
            return Ok(());
        }
    }
}
fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
use reqwest::blocking::get;
use rss::Channel;
let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?;
let channel_items = channel.items();
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
let feed_url = channel_config.url.to_string();
let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() {
Some(result) => result.channel_id,
None => db.insert_channel(&feed_url)?,
};
for channel_item in channel_items.iter().take(channel_items_limit) {
let guid = match channel_item.guid {
Some(ref guid) => guid.value.clone(),
None => {
warn!("Undefined `guid` field in `{feed_url}`");
continue;
}
};
let link = match channel_item.guid {
Some(ref link) => link.value.clone(),
None => {
warn!("Undefined `link` field in `{feed_url}`");
continue;
}
};
let pub_date = match channel_item.pub_date {
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
Ok(t) => t.timestamp(),
Err(e) => {
warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`");
continue;
}
},
None => {
warn!("Undefined `pub_date` field in `{feed_url}`");
continue;
}
};
let channel_item_id = match db
.channel_items_by_channel_id_guid(channel_id, &guid, Some(1))?
.first()
{
Some(result) => result.channel_item_id,
None => db.insert_channel_item(
channel_id,
pub_date,
&guid,
&link,
if channel_config.persist_item_title {
channel_item.title()
} else {
None
},
if channel_config.persist_item_description {
channel_item.description()
} else {
None
},
)?,
}; // @TODO
}
Ok(())
}

13
crates/mysql/Cargo.toml Normal file
View file

@ -0,0 +1,13 @@
[package]
name = "rssto-mysql"
version = "0.1.0"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Shared MySQL database library"
keywords = ["rssto", "database", "mysql", "library", "driver", "api"]
# categories = []
repository = "https://github.com/YGGverse/rssto"
[dependencies]
mysql = "26.0.1"

21
crates/mysql/LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 YGGverse
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

137
crates/mysql/src/lib.rs Normal file
View file

@ -0,0 +1,137 @@
use mysql::{Error, PooledConn, prelude::Queryable};
/// Thin wrapper around a single pooled MySQL connection, exposing the
/// queries used by the rssto crates.
pub struct Mysql {
    // One connection taken from the pool created in `connect`.
    connection: PooledConn,
}
impl Mysql {
    /// Create a connection pool for the given credentials and take one
    /// pooled connection from it.
    ///
    /// # Errors
    /// Returns [`Error`] when the server is unreachable or the
    /// credentials are rejected.
    pub fn connect(
        host: &str,
        port: u16,
        user: &str,
        password: &str,
        database: &str,
    ) -> Result<Self, Error> {
        Ok(Self {
            connection: mysql::Pool::new(
                format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(),
            )?
            .get_conn()?,
        })
    }
    /// Select channels matching `url`, at most `limit` rows
    /// ([`DEFAULT_LIMIT`] when `None`).
    pub fn channels_by_url(
        &mut self,
        url: &str,
        limit: Option<usize>,
    ) -> Result<Vec<Channel>, Error> {
        self.connection.exec_map(
            format!(
                "SELECT `channel_id`, `url` FROM `channel` WHERE `url` = ? LIMIT {}",
                limit.unwrap_or(DEFAULT_LIMIT)
            ),
            (url,),
            |(channel_id, url)| Channel { channel_id, url },
        )
    }
    /// Insert a new channel row and return its auto-generated id.
    pub fn insert_channel(&mut self, url: &str) -> Result<u64, Error> {
        self.connection
            .exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?;
        Ok(self.connection.last_insert_id())
    }
    /// Select items of `channel_id` matching `guid`, at most `limit`
    /// rows ([`DEFAULT_LIMIT`] when `None`).
    pub fn channel_items_by_channel_id_guid(
        &mut self,
        channel_id: u64,
        guid: &str,
        limit: Option<usize>,
    ) -> Result<Vec<ChannelItem>, Error> {
        self.connection.exec_map(
            // FIX: `pub_date` was missing from the column list while the
            // row-mapping closure below destructures seven values, so the
            // column count never matched and every call failed at runtime.
            format!(
                "SELECT `channel_item_id`, `channel_id`, `pub_date`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}",
                limit.unwrap_or(DEFAULT_LIMIT)),
            (
                channel_id,
                guid
            ),
            |(
                channel_item_id,
                channel_id,
                pub_date,
                guid,
                link,
                title,
                description
            )|
            ChannelItem {
                channel_item_id,
                channel_id,
                pub_date,
                guid,
                link,
                title,
                description
            },
        )
    }
    /// Insert a new channel item row and return its auto-generated id.
    /// `pub_date` is a Unix timestamp; `title`/`description` are stored
    /// as NULL when `None`.
    pub fn insert_channel_item(
        &mut self,
        channel_id: u64,
        pub_date: i64,
        guid: &str,
        link: &str,
        title: Option<&str>,
        description: Option<&str>,
    ) -> Result<u64, Error> {
        self.connection.exec_drop(
            "INSERT INTO `channel_item` SET `channel_id` = ?, `pub_date` = ?, `guid` = ?, `link` = ?, `title` = ?, `description` = ?",
            (channel_id, pub_date, guid, link, title, description),
        )?;
        Ok(self.connection.last_insert_id())
    }
    /// Insert a content row for `channel_item_id`; `source_id` is `None`
    /// for original (crawled) text.
    pub fn insert_content(
        &mut self,
        channel_item_id: u64,
        source_id: Option<u64>,
        title: &str,
        description: &str,
    ) -> Result<(), Error> {
        self.connection.exec_drop(
            "INSERT INTO `content` SET `channel_item_id` = ?, `source_id` = ?, `title` = ?, `description` = ?",
            (channel_item_id, source_id, title, description ),
        )
    }
}
/// A row of the `channel` table.
#[derive(Debug, PartialEq, Eq)]
pub struct Channel {
    /// Auto-generated primary key
    pub channel_id: u64,
    /// RSS feed source URL
    pub url: String,
}
/// A row of the `channel_item` table.
#[derive(Debug, PartialEq, Eq)]
pub struct ChannelItem {
    /// Auto-generated primary key
    pub channel_item_id: u64,
    /// Owning `channel` row
    pub channel_id: u64,
    /// Publication time as a Unix timestamp.
    ///
    /// FIX: widened from `i32` to `i64` to match the `pub_date: i64`
    /// parameter of `insert_channel_item` (a chrono Unix timestamp) —
    /// the previous `i32` was inconsistent with what is written and
    /// overflows after 2038.
    pub pub_date: i64,
    pub guid: String,
    pub link: String,
    /// NULL in the database when the feed's title is not persisted
    pub title: Option<String>,
    /// NULL in the database when the feed's description is not persisted
    pub description: Option<String>,
}
/// A row of the `content` table.
#[derive(Debug, PartialEq, Eq)]
pub struct Content {
    /// Owning `channel_item` row
    pub channel_item_id: u64,
    /// None if the original `title` and `description` values
    /// parsed from the channel item on crawl
    pub source_id: Option<u64>,
    pub title: String,
    pub description: String,
}
/// Fallback row limit applied when a query is called with `limit: None`.
const DEFAULT_LIMIT: usize = 100;

View file

@ -1,32 +0,0 @@
use serde::Deserialize;
use std::path::PathBuf;
use url::Url;
/// Per-feed export settings, deserialized from each `[[feed]]` table
/// of the TOML configuration file.
#[derive(Debug, Deserialize)]
pub struct Feed {
    /// RSS feed source
    pub url: Url,
    /// Destination directory
    pub storage: PathBuf,
    /// Path to templates (export formats)
    pub templates: Vec<PathBuf>,
    /// Limit channel items (unlimited by default)
    pub list_items_limit: Option<usize>,
    /// chrono format string for item/channel publication times
    pub pub_date_format: String,
    /// chrono format string for the channel's last-build time
    pub last_build_date_format: String,
    /// chrono format string for the generation timestamp
    pub time_generated_format: String,
}
/// Root configuration, parsed from the TOML file given on the command
/// line.
#[derive(Debug, Deserialize)]
pub struct Config {
    /// Feeds to export (`[[feed]]` tables)
    pub feed: Vec<Feed>,
    /// Update timeout in seconds
    ///
    /// * None to generate once
    pub update: Option<u64>,
}

View file

@ -1,148 +0,0 @@
mod argument;
mod config;
use anyhow::Result;
use argument::Argument;
use chrono::{DateTime, Local};
use clap::Parser;
use config::{Config, Feed};
use log::{debug, info, warn};
use std::{
env::var,
fs::{File, create_dir_all, read_to_string},
io::Write,
path::PathBuf,
};
use strip_tags::*;
/// Exporter entry point: parse CLI arguments, load the TOML config,
/// then render every configured feed — once, or in a loop when
/// `update` (seconds) is set in the config.
fn main() -> Result<()> {
    // Initialize tracing output only when RUST_LOG is set; the level
    // filter is taken from the environment via EnvFilter.
    if var("RUST_LOG").is_ok() {
        use tracing_subscriber::{EnvFilter, fmt::*};
        // Custom timer rendering log timestamps in local time.
        struct T;
        impl time::FormatTime for T {
            fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
                write!(w, "{}", Local::now())
            }
        }
        fmt()
            .with_timer(T)
            .with_env_filter(EnvFilter::from_default_env())
            .init()
    }
    let argument = Argument::parse();
    let config: Config = toml::from_str(&read_to_string(argument.config)?)?;
    info!("Crawler started");
    loop {
        debug!("Begin new crawl queue...");
        for feed in &config.feed {
            debug!("Update `{}`...", feed.url);
            // A failing feed is logged and skipped; it must not abort the queue.
            if let Err(e) = crawl(feed) {
                warn!("Feed `{}` update failed: `{e}`", feed.url)
            }
        }
        debug!("Crawl queue completed");
        if let Some(update) = config.update {
            debug!("Wait {update} seconds to continue...",);
            std::thread::sleep(std::time::Duration::from_secs(update))
        } else {
            // No update interval configured: run a single pass and exit.
            return Ok(());
        }
    }
}
/// Fetch the RSS feed and render it through every configured template,
/// writing one `index.{ext}` file per template into `feed.storage`.
///
/// The last path component of each template directory doubles as the
/// output extension (e.g. `./template/html` -> `index.html`);
/// `index.{ext}` is the page template and `index/item.{ext}` the
/// per-item template. Placeholders such as `{title}` are substituted
/// by plain string replacement.
fn crawl(feed: &Feed) -> Result<()> {
    use reqwest::blocking::get;
    use rss::Channel;
    // Download and parse the RSS document.
    let channel = Channel::read_from(&get(feed.url.as_str())?.bytes()?[..])?;
    let channel_items = channel.items();
    // Unlimited unless `list_items_limit` is configured.
    let channel_items_limit = feed.list_items_limit.unwrap_or(channel_items.len());
    // Collapses runs of 3+ newlines left behind by tag stripping.
    let regex = regex::Regex::new(r"\n{3,}").unwrap();
    for template in &feed.templates {
        let root = PathBuf::from(template);
        // Template directory name used as the output file extension.
        let extension = root.file_name().unwrap().to_string_lossy();
        // Page template: `<template>/index.<ext>`
        let index = {
            let mut p = PathBuf::from(&root);
            p.push(format!("index.{extension}"));
            read_to_string(p)?
        };
        // Per-item template: `<template>/index/item.<ext>`
        let index_item = {
            let mut p = PathBuf::from(&root);
            p.push("index");
            p.push(format!("item.{extension}"));
            read_to_string(p)?
        };
        create_dir_all(&feed.storage)?;
        // Render the page and write it to `<storage>/index.<ext>`.
        File::create({
            let mut p = PathBuf::from(&feed.storage);
            p.push(format!("index.{extension}"));
            p
        })?
        .write_all(
            index
                // Channel-level placeholders; HTML tags are stripped
                // from free-text fields before substitution.
                .replace("{title}", &strip_tags(channel.title()))
                .replace("{description}", &strip_tags(channel.description()))
                .replace("{link}", channel.link())
                .replace("{language}", channel.language().unwrap_or_default())
                .replace(
                    "{pub_date}",
                    &time(channel.pub_date(), &feed.pub_date_format),
                )
                .replace(
                    "{last_build_date}",
                    &time(channel.last_build_date(), &feed.last_build_date_format),
                )
                .replace("{time_generated}", &time(None, &feed.time_generated_format))
                .replace(
                    "{items}",
                    // Render each item through the item template and
                    // concatenate the results.
                    &channel_items
                        .iter()
                        .take(channel_items_limit)
                        .map(|i| {
                            regex
                                .replace_all(
                                    &index_item
                                        .replace(
                                            "{title}",
                                            &strip_tags(i.title().unwrap_or_default()),
                                        )
                                        .replace(
                                            "{description}",
                                            &strip_tags(i.description().unwrap_or_default()),
                                        )
                                        .replace("{link}", i.link().unwrap_or_default())
                                        .replace(
                                            "{pub_date}",
                                            &time(i.pub_date(), &feed.pub_date_format),
                                        ),
                                    "\n\n",
                                )
                                .to_string()
                        })
                        .collect::<String>(),
                )
                .as_bytes(),
        )?
    }
    Ok(())
}
fn time(value: Option<&str>, format: &str) -> String {
match value {
Some(v) => DateTime::parse_from_rfc2822(v).unwrap(),
None => Local::now().into(),
}
.format(format)
.to_string()
}

View file

@ -1,7 +0,0 @@
# {title}
{description}
## {time_generated}
{items}

View file

@ -1,6 +0,0 @@
### {title}
{description}
=> {link} {pub_date}

View file

@ -1,49 +0,0 @@
<!DOCTYPE html>
<html lang="{language}">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title}</title>
<style>
* {
color-scheme: light dark
}
body {
margin: 0 auto;
max-width: 1024px
}
header {
border-bottom: 1px #ccc dotted;
padding-bottom: 32px;
}
section > article {
border-bottom: 1px #ccc dotted;
padding-bottom: 32px;
}
footer {
font-size: small;
padding: 16px 0;
}
</style>
</head>
<body>
<header>
<h1>{title}</h1>
{description}
</header>
<section>
{items}
</section>
<footer>
<p>
Source: <a href="{link}">{title}</a> |
Updated: {pub_date} |
Build: {last_build_date} |
Generated: {time_generated}
</p>
<p>
Powered by <a href="https://github.com/YGGverse/rssto">rssto</a>.
</p>
</footer>
</body>
</html>

View file

@ -1,5 +0,0 @@
<article>
<h2>{title}</h2>
<p>{description}</p>
<a href="{link}">{pub_date}</a>
</article>