mirror of
https://github.com/YGGverse/rssto.git
synced 2026-03-31 17:15:29 +00:00
initial commit
This commit is contained in:
parent
8dfc595961
commit
e070db316c
19 changed files with 400 additions and 356 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -1,3 +1,2 @@
|
||||||
/public
|
|
||||||
/target
|
/target
|
||||||
Cargo.lock
|
Cargo.lock
|
||||||
30
Cargo.toml
30
Cargo.toml
|
|
@ -1,24 +1,6 @@
|
||||||
[package]
|
[workspace]
|
||||||
name = "rssto"
|
resolver = "2"
|
||||||
version = "0.2.2"
|
members = [
|
||||||
edition = "2024"
|
"crates/crawler",
|
||||||
license = "MIT"
|
"crates/mysql",
|
||||||
readme = "README.md"
|
]
|
||||||
description = "Convert RSS feeds into multiple formats"
|
|
||||||
keywords = ["rss", "aggregator", "conversion", "html", "gemtext"]
|
|
||||||
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
|
|
||||||
repository = "https://github.com/YGGverse/rssto"
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
anyhow = "1.0"
|
|
||||||
chrono = "^0.4.20"
|
|
||||||
clap = { version = "4.5", features = ["derive"] }
|
|
||||||
log = "0.4"
|
|
||||||
regex = "1.12"
|
|
||||||
reqwest = { version = "0.12", features = ["blocking"] }
|
|
||||||
rss = "2.0"
|
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
|
||||||
strip-tags = "0.1"
|
|
||||||
toml = "0.9"
|
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
||||||
url = "2.5"
|
|
||||||
67
README.md
67
README.md
|
|
@ -6,68 +6,5 @@
|
||||||
|
|
||||||
Convert RSS feeds into multiple formats
|
Convert RSS feeds into multiple formats
|
||||||
|
|
||||||
## Features
|
> [!NOTE]
|
||||||
|
> Branch in development!
|
||||||
* [x] Multiple feed sources with flexible TOML config options
|
|
||||||
* [x] Limit channel items
|
|
||||||
* [x] Format time
|
|
||||||
* [x] Multiple export format definition
|
|
||||||
* [x] Custom templates
|
|
||||||
* [x] Single export or daemon mode with update time
|
|
||||||
* [x] Export formats:
|
|
||||||
* [x] HTML
|
|
||||||
* [x] [Gemtext](https://geminiprotocol.net/docs/gemtext.gmi)
|
|
||||||
|
|
||||||
## Install
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
cargo install rssto
|
|
||||||
```
|
|
||||||
|
|
||||||
## Launch
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
rssto -c config/example.toml
|
|
||||||
```
|
|
||||||
> [!TIP]
|
|
||||||
> * prepend `RUST_LOG=DEBUG` to print worker details (supported [levels](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.LevelFilter.html))
|
|
||||||
> * append `-u TIME` to run as the daemon with `TIME` interval update
|
|
||||||
> * see `rssto --help` to print all available options
|
|
||||||
|
|
||||||
### Systemd
|
|
||||||
|
|
||||||
1. Install `rssto` by copying the compiled binary to the system-wide location for native applications:
|
|
||||||
* Linux: `sudo install /home/user/.cargo/bin/rssto /usr/local/bin/rssto`
|
|
||||||
2. Create `systemd` configuration file at `/etc/systemd/system/rssto.service`:
|
|
||||||
|
|
||||||
``` rssto.service
|
|
||||||
[Unit]
|
|
||||||
After=network-online.target
|
|
||||||
Wants=network-online.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=simple
|
|
||||||
|
|
||||||
User=rssto
|
|
||||||
Group=rssto
|
|
||||||
|
|
||||||
# Uncomment for debug
|
|
||||||
# Environment="RUST_LOG=DEBUG"
|
|
||||||
# Environment="NO_COLOR=1"
|
|
||||||
|
|
||||||
ExecStart=/usr/local/bin/rssto -c /path/to/config.toml
|
|
||||||
|
|
||||||
StandardOutput=file:///home/rssto/debug.log
|
|
||||||
StandardError=file:///home/rssto/error.log
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
```
|
|
||||||
* example above requires new system user (`useradd -m rssto`)
|
|
||||||
|
|
||||||
3. Run the following commands in order:
|
|
||||||
|
|
||||||
* `systemctl daemon-reload` - reload systemd configuration
|
|
||||||
* `systemctl enable rssto` - enable new service
|
|
||||||
* `systemctl start rssto` - start the process
|
|
||||||
* `systemctl status rssto` - check process launched
|
|
||||||
|
|
@ -1,19 +0,0 @@
|
||||||
update = 60
|
|
||||||
|
|
||||||
[[feed]]
|
|
||||||
url = "https://assets.censor.net/rss/censor.net/rss_uk_news.xml"
|
|
||||||
storage = "./public/censor.net/rss_uk_news"
|
|
||||||
templates = ["./template/html","./template/gmi"]
|
|
||||||
list_items_limit = 20
|
|
||||||
pub_date_format = "%Y/%m/%d %H:%M:%S %z"
|
|
||||||
last_build_date_format = "%Y/%m/%d %H:%M:%S %z"
|
|
||||||
time_generated_format = "%Y/%m/%d %H:%M:%S %z"
|
|
||||||
|
|
||||||
[[feed]]
|
|
||||||
url = "https://assets.censor.net/rss/censor.net/rss_uk_resonance.xml"
|
|
||||||
storage = "./public/censor.net/rss_uk_resonance"
|
|
||||||
templates = ["./template/html","./template/gmi"]
|
|
||||||
list_items_limit = 20
|
|
||||||
pub_date_format = "%Y/%m/%d %H:%M:%S %z"
|
|
||||||
last_build_date_format = "%Y/%m/%d %H:%M:%S %z"
|
|
||||||
time_generated_format = "%Y/%m/%d %H:%M:%S %z"
|
|
||||||
23
crates/crawler/Cargo.toml
Normal file
23
crates/crawler/Cargo.toml
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
[package]
|
||||||
|
name = "rssto-crawler"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
license = "MIT"
|
||||||
|
readme = "README.md"
|
||||||
|
description = "Crawl RSS feeds into MySQL database"
|
||||||
|
keywords = ["rss", "aggregator", "conversion", "mysql", "crawler"]
|
||||||
|
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
|
||||||
|
repository = "https://github.com/YGGverse/rssto"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow = "1.0.100"
|
||||||
|
chrono = "0.4.42"
|
||||||
|
clap = { version = "4.5.54", features = ["derive"] }
|
||||||
|
log = "0.4.29"
|
||||||
|
mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
|
||||||
|
reqwest = { version = "0.13.1", features = ["blocking"] }
|
||||||
|
rss = "2.0.12"
|
||||||
|
serde = { version = "1.0.228", features = ["derive"] }
|
||||||
|
toml = "0.9.10"
|
||||||
|
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
|
||||||
|
url = { version = "2.5.8", features = ["serde"] }
|
||||||
21
crates/crawler/LICENSE
Normal file
21
crates/crawler/LICENSE
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2026 YGGverse
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
20
crates/crawler/config/example.toml
Normal file
20
crates/crawler/config/example.toml
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
update = 60
|
||||||
|
|
||||||
|
[mysql]
|
||||||
|
host = "localhost"
|
||||||
|
port = 3307
|
||||||
|
user = ""
|
||||||
|
password = ""
|
||||||
|
database = "rssto"
|
||||||
|
|
||||||
|
[[channel]]
|
||||||
|
url = "https://"
|
||||||
|
items_limit = 20
|
||||||
|
persist_item_title = true
|
||||||
|
persist_item_description = true
|
||||||
|
|
||||||
|
[[channel]]
|
||||||
|
url = "https://"
|
||||||
|
items_limit = 20
|
||||||
|
persist_item_title = true
|
||||||
|
persist_item_description = true
|
||||||
33
crates/crawler/src/config.rs
Normal file
33
crates/crawler/src/config.rs
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
use serde::Deserialize;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
/// MySQL connection settings, deserialized from the config file.
///
/// The fields are handed to the database connector in `main`.
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct Mysql {
|
||||||
|
/// Database name
pub database: String,
|
||||||
|
/// Server host name or address
pub host: String,
|
||||||
|
/// Password for `user`
pub password: String,
|
||||||
|
/// Server TCP port
pub port: u16,
|
||||||
|
/// Account name used to log in
pub user: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Settings for a single feed to crawl, deserialized from the config file.
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct Channel {
|
||||||
|
/// RSS feed source URL
|
||||||
|
pub url: Url,
|
||||||
|
/// Maximum number of feed items to process per crawl (`None` = all items)
|
||||||
|
pub items_limit: Option<usize>,
|
||||||
|
/// Store the item title when inserting a new item (otherwise `None` is stored)
|
||||||
|
pub persist_item_title: bool,
|
||||||
|
/// Store the item description when inserting a new item (otherwise `None` is stored)
|
||||||
|
pub persist_item_description: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Root of the crawler configuration file.
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct Config {
|
||||||
|
/// Database connection settings
pub mysql: Mysql,
|
||||||
|
/// Feeds to crawl, visited in the listed order on every pass
pub channel: Vec<Channel>,
|
||||||
|
/// Pause between crawl passes, in seconds
|
||||||
|
///
|
||||||
|
/// * `None` to run a single pass and exit
|
||||||
|
pub update: Option<u64>,
|
||||||
|
}
|
||||||
124
crates/crawler/src/main.rs
Normal file
124
crates/crawler/src/main.rs
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
mod argument;
|
||||||
|
mod config;
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use log::{debug, info, warn};
|
||||||
|
use mysql::Mysql;
|
||||||
|
|
||||||
|
fn main() -> Result<()> {
|
||||||
|
use argument::Argument;
|
||||||
|
use chrono::Local;
|
||||||
|
use clap::Parser;
|
||||||
|
use std::{env::var, fs::read_to_string};
|
||||||
|
|
||||||
|
if var("RUST_LOG").is_ok() {
|
||||||
|
use tracing_subscriber::{EnvFilter, fmt::*};
|
||||||
|
struct T;
|
||||||
|
impl time::FormatTime for T {
|
||||||
|
fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
|
||||||
|
write!(w, "{}", Local::now())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt()
|
||||||
|
.with_timer(T)
|
||||||
|
.with_env_filter(EnvFilter::from_default_env())
|
||||||
|
.init()
|
||||||
|
}
|
||||||
|
|
||||||
|
let argument = Argument::parse();
|
||||||
|
let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
|
||||||
|
|
||||||
|
let mut database = Mysql::connect(
|
||||||
|
&config.mysql.host,
|
||||||
|
config.mysql.port,
|
||||||
|
&config.mysql.user,
|
||||||
|
&config.mysql.password,
|
||||||
|
&config.mysql.database,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
info!("Crawler started");
|
||||||
|
loop {
|
||||||
|
debug!("Begin new crawl queue...");
|
||||||
|
for feed in &config.channel {
|
||||||
|
debug!("Update `{}`...", feed.url);
|
||||||
|
if let Err(e) = crawl(&mut database, feed) {
|
||||||
|
warn!("Feed `{}` update failed: `{e}`", feed.url)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug!("Crawl queue completed");
|
||||||
|
if let Some(update) = config.update {
|
||||||
|
debug!("Wait {update} seconds to continue...",);
|
||||||
|
std::thread::sleep(std::time::Duration::from_secs(update))
|
||||||
|
} else {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
|
||||||
|
use reqwest::blocking::get;
|
||||||
|
use rss::Channel;
|
||||||
|
|
||||||
|
let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?;
|
||||||
|
let channel_items = channel.items();
|
||||||
|
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
|
||||||
|
|
||||||
|
let feed_url = channel_config.url.to_string();
|
||||||
|
let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() {
|
||||||
|
Some(result) => result.channel_id,
|
||||||
|
None => db.insert_channel(&feed_url)?,
|
||||||
|
};
|
||||||
|
|
||||||
|
for channel_item in channel_items.iter().take(channel_items_limit) {
|
||||||
|
let guid = match channel_item.guid {
|
||||||
|
Some(ref guid) => guid.value.clone(),
|
||||||
|
None => {
|
||||||
|
warn!("Undefined `guid` field in `{feed_url}`");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let link = match channel_item.guid {
|
||||||
|
Some(ref link) => link.value.clone(),
|
||||||
|
None => {
|
||||||
|
warn!("Undefined `link` field in `{feed_url}`");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let pub_date = match channel_item.pub_date {
|
||||||
|
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
|
||||||
|
Ok(t) => t.timestamp(),
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => {
|
||||||
|
warn!("Undefined `pub_date` field in `{feed_url}`");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let channel_item_id = match db
|
||||||
|
.channel_items_by_channel_id_guid(channel_id, &guid, Some(1))?
|
||||||
|
.first()
|
||||||
|
{
|
||||||
|
Some(result) => result.channel_item_id,
|
||||||
|
None => db.insert_channel_item(
|
||||||
|
channel_id,
|
||||||
|
pub_date,
|
||||||
|
&guid,
|
||||||
|
&link,
|
||||||
|
if channel_config.persist_item_title {
|
||||||
|
channel_item.title()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
},
|
||||||
|
if channel_config.persist_item_description {
|
||||||
|
channel_item.description()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
},
|
||||||
|
)?,
|
||||||
|
}; // @TODO
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
13
crates/mysql/Cargo.toml
Normal file
13
crates/mysql/Cargo.toml
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
[package]
|
||||||
|
name = "rssto-mysql"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
license = "MIT"
|
||||||
|
readme = "README.md"
|
||||||
|
description = "Shared MySQL database library"
|
||||||
|
keywords = ["rssto", "database", "mysql", "library", "driver", "api"]
|
||||||
|
# categories = []
|
||||||
|
repository = "https://github.com/YGGverse/rssto"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
mysql = "26.0.1"
|
||||||
21
crates/mysql/LICENSE
Normal file
21
crates/mysql/LICENSE
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2026 YGGverse
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
137
crates/mysql/src/lib.rs
Normal file
137
crates/mysql/src/lib.rs
Normal file
|
|
@ -0,0 +1,137 @@
|
||||||
|
use mysql::{Error, PooledConn, prelude::Queryable};
|
||||||
|
|
||||||
|
/// Database handle wrapping a single pooled MySQL connection.
pub struct Mysql {
|
||||||
|
/// Connection checked out from the pool in `connect`; every query runs on it
connection: PooledConn,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Mysql {
|
||||||
|
pub fn connect(
|
||||||
|
host: &str,
|
||||||
|
port: u16,
|
||||||
|
user: &str,
|
||||||
|
password: &str,
|
||||||
|
database: &str,
|
||||||
|
) -> Result<Self, Error> {
|
||||||
|
Ok(Self {
|
||||||
|
connection: mysql::Pool::new(
|
||||||
|
format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(),
|
||||||
|
)?
|
||||||
|
.get_conn()?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn channels_by_url(
|
||||||
|
&mut self,
|
||||||
|
url: &str,
|
||||||
|
limit: Option<usize>,
|
||||||
|
) -> Result<Vec<Channel>, Error> {
|
||||||
|
self.connection.exec_map(
|
||||||
|
format!(
|
||||||
|
"SELECT `channel_id`, `url` FROM `channel` WHERE `url` = ? LIMIT {}",
|
||||||
|
limit.unwrap_or(DEFAULT_LIMIT)
|
||||||
|
),
|
||||||
|
(url,),
|
||||||
|
|(channel_id, url)| Channel { channel_id, url },
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert_channel(&mut self, url: &str) -> Result<u64, Error> {
|
||||||
|
self.connection
|
||||||
|
.exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?;
|
||||||
|
|
||||||
|
Ok(self.connection.last_insert_id())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn channel_items_by_channel_id_guid(
|
||||||
|
&mut self,
|
||||||
|
channel_id: u64,
|
||||||
|
guid: &str,
|
||||||
|
limit: Option<usize>,
|
||||||
|
) -> Result<Vec<ChannelItem>, Error> {
|
||||||
|
self.connection.exec_map(
|
||||||
|
format!(
|
||||||
|
"SELECT `channel_item_id`, `channel_id`, `guid`, `link`, `title`, `description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}",
|
||||||
|
limit.unwrap_or(DEFAULT_LIMIT)),
|
||||||
|
(
|
||||||
|
channel_id,
|
||||||
|
guid
|
||||||
|
),
|
||||||
|
|(
|
||||||
|
channel_item_id,
|
||||||
|
channel_id,
|
||||||
|
pub_date,
|
||||||
|
guid,
|
||||||
|
link,
|
||||||
|
title,
|
||||||
|
description
|
||||||
|
)|
|
||||||
|
ChannelItem {
|
||||||
|
channel_item_id,
|
||||||
|
channel_id,
|
||||||
|
pub_date,
|
||||||
|
guid,
|
||||||
|
link,
|
||||||
|
title,
|
||||||
|
description
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert_channel_item(
|
||||||
|
&mut self,
|
||||||
|
channel_id: u64,
|
||||||
|
pub_date: i64,
|
||||||
|
guid: &str,
|
||||||
|
link: &str,
|
||||||
|
title: Option<&str>,
|
||||||
|
description: Option<&str>,
|
||||||
|
) -> Result<u64, Error> {
|
||||||
|
self.connection.exec_drop(
|
||||||
|
"INSERT INTO `channel_item` SET `channel_id` = ?, `pub_date` = ?, `guid` = ?, `link` = ?, `title` = ?, `description` = ?",
|
||||||
|
(channel_id, pub_date, guid, link, title, description),
|
||||||
|
)?;
|
||||||
|
Ok(self.connection.last_insert_id())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert_content(
|
||||||
|
&mut self,
|
||||||
|
channel_item_id: u64,
|
||||||
|
source_id: Option<u64>,
|
||||||
|
title: &str,
|
||||||
|
description: &str,
|
||||||
|
) -> Result<(), Error> {
|
||||||
|
self.connection.exec_drop(
|
||||||
|
"INSERT INTO `content` SET `channel_item_id` = ?, `source_id` = ?, `title` = ?, `description` = ?",
|
||||||
|
(channel_item_id, source_id, title, description ),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A row selected from the `channel` table.
#[derive(Debug, PartialEq, Eq)]
|
||||||
|
pub struct Channel {
|
||||||
|
/// Value of the `channel_id` column
pub channel_id: u64,
|
||||||
|
/// Feed URL this channel was registered with
pub url: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A row selected from the `channel_item` table.
#[derive(Debug, PartialEq, Eq)]
pub struct ChannelItem {
    /// Value of the `channel_item_id` column
    pub channel_item_id: u64,
    /// ID of the channel this item belongs to
    pub channel_id: u64,
    /// Publication time as written by `insert_channel_item`.
    ///
    /// Kept 64-bit to match the `i64` accepted on insert; a 32-bit field
    /// cannot hold Unix timestamps past January 2038.
    pub pub_date: i64,
    /// Item GUID as published by the feed
    pub guid: String,
    /// Item link
    pub link: String,
    /// Item title (`None` when stored as NULL)
    pub title: Option<String>,
    /// Item description (`None` when stored as NULL)
    pub description: Option<String>,
}
|
||||||
|
|
||||||
|
/// Title/description pair attached to a channel item.
///
/// NOTE(review): the fields mirror the columns written by `insert_content`;
/// presumably this is the row type of the `content` table — confirm once a
/// query returning it exists.
#[derive(Debug, PartialEq, Eq)]
|
||||||
|
pub struct Content {
|
||||||
|
/// ID of the channel item this content belongs to
pub channel_item_id: u64,
|
||||||
|
/// None if the original `title` and `description` values
|
||||||
|
/// parsed from the channel item on crawl
|
||||||
|
pub source_id: Option<u64>,
|
||||||
|
/// Content title
pub title: String,
|
||||||
|
/// Content description
pub description: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Row limit applied to `SELECT` queries when the caller passes `None`.
const DEFAULT_LIMIT: usize = 100;
|
||||||
|
|
@ -1,32 +0,0 @@
|
||||||
use serde::Deserialize;
|
|
||||||
use std::path::PathBuf;
|
|
||||||
use url::Url;
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
pub struct Feed {
|
|
||||||
/// RSS feed source
|
|
||||||
pub url: Url,
|
|
||||||
|
|
||||||
/// Destination directory
|
|
||||||
pub storage: PathBuf,
|
|
||||||
|
|
||||||
/// Path to templates (export formats)
|
|
||||||
pub templates: Vec<PathBuf>,
|
|
||||||
|
|
||||||
/// Limit channel items (unlimited by default)
|
|
||||||
pub list_items_limit: Option<usize>,
|
|
||||||
|
|
||||||
pub pub_date_format: String,
|
|
||||||
pub last_build_date_format: String,
|
|
||||||
pub time_generated_format: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
|
||||||
pub struct Config {
|
|
||||||
pub feed: Vec<Feed>,
|
|
||||||
|
|
||||||
/// Update timeout in seconds
|
|
||||||
///
|
|
||||||
/// * None to generate once
|
|
||||||
pub update: Option<u64>,
|
|
||||||
}
|
|
||||||
148
src/main.rs
148
src/main.rs
|
|
@ -1,148 +0,0 @@
|
||||||
mod argument;
|
|
||||||
mod config;
|
|
||||||
|
|
||||||
use anyhow::Result;
|
|
||||||
use argument::Argument;
|
|
||||||
use chrono::{DateTime, Local};
|
|
||||||
use clap::Parser;
|
|
||||||
use config::{Config, Feed};
|
|
||||||
use log::{debug, info, warn};
|
|
||||||
use std::{
|
|
||||||
env::var,
|
|
||||||
fs::{File, create_dir_all, read_to_string},
|
|
||||||
io::Write,
|
|
||||||
path::PathBuf,
|
|
||||||
};
|
|
||||||
use strip_tags::*;
|
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
|
||||||
if var("RUST_LOG").is_ok() {
|
|
||||||
use tracing_subscriber::{EnvFilter, fmt::*};
|
|
||||||
struct T;
|
|
||||||
impl time::FormatTime for T {
|
|
||||||
fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
|
|
||||||
write!(w, "{}", Local::now())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fmt()
|
|
||||||
.with_timer(T)
|
|
||||||
.with_env_filter(EnvFilter::from_default_env())
|
|
||||||
.init()
|
|
||||||
}
|
|
||||||
|
|
||||||
let argument = Argument::parse();
|
|
||||||
let config: Config = toml::from_str(&read_to_string(argument.config)?)?;
|
|
||||||
|
|
||||||
info!("Crawler started");
|
|
||||||
|
|
||||||
loop {
|
|
||||||
debug!("Begin new crawl queue...");
|
|
||||||
|
|
||||||
for feed in &config.feed {
|
|
||||||
debug!("Update `{}`...", feed.url);
|
|
||||||
if let Err(e) = crawl(feed) {
|
|
||||||
warn!("Feed `{}` update failed: `{e}`", feed.url)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("Crawl queue completed");
|
|
||||||
|
|
||||||
if let Some(update) = config.update {
|
|
||||||
debug!("Wait {update} seconds to continue...",);
|
|
||||||
std::thread::sleep(std::time::Duration::from_secs(update))
|
|
||||||
} else {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn crawl(feed: &Feed) -> Result<()> {
|
|
||||||
use reqwest::blocking::get;
|
|
||||||
use rss::Channel;
|
|
||||||
|
|
||||||
let channel = Channel::read_from(&get(feed.url.as_str())?.bytes()?[..])?;
|
|
||||||
let channel_items = channel.items();
|
|
||||||
let channel_items_limit = feed.list_items_limit.unwrap_or(channel_items.len());
|
|
||||||
let regex = regex::Regex::new(r"\n{3,}").unwrap();
|
|
||||||
|
|
||||||
for template in &feed.templates {
|
|
||||||
let root = PathBuf::from(template);
|
|
||||||
let extension = root.file_name().unwrap().to_string_lossy();
|
|
||||||
|
|
||||||
let index = {
|
|
||||||
let mut p = PathBuf::from(&root);
|
|
||||||
p.push(format!("index.{extension}"));
|
|
||||||
read_to_string(p)?
|
|
||||||
};
|
|
||||||
|
|
||||||
let index_item = {
|
|
||||||
let mut p = PathBuf::from(&root);
|
|
||||||
p.push("index");
|
|
||||||
p.push(format!("item.{extension}"));
|
|
||||||
read_to_string(p)?
|
|
||||||
};
|
|
||||||
|
|
||||||
create_dir_all(&feed.storage)?;
|
|
||||||
File::create({
|
|
||||||
let mut p = PathBuf::from(&feed.storage);
|
|
||||||
p.push(format!("index.{extension}"));
|
|
||||||
p
|
|
||||||
})?
|
|
||||||
.write_all(
|
|
||||||
index
|
|
||||||
.replace("{title}", &strip_tags(channel.title()))
|
|
||||||
.replace("{description}", &strip_tags(channel.description()))
|
|
||||||
.replace("{link}", channel.link())
|
|
||||||
.replace("{language}", channel.language().unwrap_or_default())
|
|
||||||
.replace(
|
|
||||||
"{pub_date}",
|
|
||||||
&time(channel.pub_date(), &feed.pub_date_format),
|
|
||||||
)
|
|
||||||
.replace(
|
|
||||||
"{last_build_date}",
|
|
||||||
&time(channel.last_build_date(), &feed.last_build_date_format),
|
|
||||||
)
|
|
||||||
.replace("{time_generated}", &time(None, &feed.time_generated_format))
|
|
||||||
.replace(
|
|
||||||
"{items}",
|
|
||||||
&channel_items
|
|
||||||
.iter()
|
|
||||||
.take(channel_items_limit)
|
|
||||||
.map(|i| {
|
|
||||||
regex
|
|
||||||
.replace_all(
|
|
||||||
&index_item
|
|
||||||
.replace(
|
|
||||||
"{title}",
|
|
||||||
&strip_tags(i.title().unwrap_or_default()),
|
|
||||||
)
|
|
||||||
.replace(
|
|
||||||
"{description}",
|
|
||||||
&strip_tags(i.description().unwrap_or_default()),
|
|
||||||
)
|
|
||||||
.replace("{link}", i.link().unwrap_or_default())
|
|
||||||
.replace(
|
|
||||||
"{pub_date}",
|
|
||||||
&time(i.pub_date(), &feed.pub_date_format),
|
|
||||||
),
|
|
||||||
"\n\n",
|
|
||||||
)
|
|
||||||
.to_string()
|
|
||||||
})
|
|
||||||
.collect::<String>(),
|
|
||||||
)
|
|
||||||
.as_bytes(),
|
|
||||||
)?
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn time(value: Option<&str>, format: &str) -> String {
|
|
||||||
match value {
|
|
||||||
Some(v) => DateTime::parse_from_rfc2822(v).unwrap(),
|
|
||||||
None => Local::now().into(),
|
|
||||||
}
|
|
||||||
.format(format)
|
|
||||||
.to_string()
|
|
||||||
}
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
# {title}
|
|
||||||
|
|
||||||
{description}
|
|
||||||
|
|
||||||
## {time_generated}
|
|
||||||
|
|
||||||
{items}
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
|
|
||||||
### {title}
|
|
||||||
|
|
||||||
{description}
|
|
||||||
|
|
||||||
=> {link} {pub_date}
|
|
||||||
|
|
@ -1,49 +0,0 @@
|
||||||
<!DOCTYPE html>
|
|
||||||
<html lang="{language}">
|
|
||||||
<head>
|
|
||||||
<meta charset="UTF-8">
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
||||||
<title>{title}</title>
|
|
||||||
<style>
|
|
||||||
* {
|
|
||||||
color-scheme: light dark
|
|
||||||
}
|
|
||||||
body {
|
|
||||||
margin: 0 auto;
|
|
||||||
max-width: 1024px
|
|
||||||
}
|
|
||||||
header {
|
|
||||||
border-bottom: 1px #ccc dotted;
|
|
||||||
padding-bottom: 32px;
|
|
||||||
}
|
|
||||||
section > article {
|
|
||||||
border-bottom: 1px #ccc dotted;
|
|
||||||
padding-bottom: 32px;
|
|
||||||
}
|
|
||||||
footer {
|
|
||||||
font-size: small;
|
|
||||||
padding: 16px 0;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<header>
|
|
||||||
<h1>{title}</h1>
|
|
||||||
{description}
|
|
||||||
</header>
|
|
||||||
<section>
|
|
||||||
{items}
|
|
||||||
</section>
|
|
||||||
<footer>
|
|
||||||
<p>
|
|
||||||
Source: <a href="{link}">{title}</a> |
|
|
||||||
Updated: {pub_date} |
|
|
||||||
Build: {last_build_date} |
|
|
||||||
Generated: {time_generated}
|
|
||||||
</p>
|
|
||||||
<p>
|
|
||||||
Powered by <a href="https://github.com/YGGverse/rssto">rssto</a>.
|
|
||||||
</p>
|
|
||||||
</footer>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
<article>
|
|
||||||
<h2>{title}</h2>
|
|
||||||
<p>{description}</p>
|
|
||||||
<a href="{link}">{pub_date}</a>
|
|
||||||
</article>
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue