mirror of
https://github.com/YGGverse/rssto.git
synced 2026-04-02 10:05:32 +00:00
initial commit
This commit is contained in:
parent
8dfc595961
commit
e070db316c
19 changed files with 400 additions and 356 deletions
23
crates/crawler/Cargo.toml
Normal file
23
crates/crawler/Cargo.toml
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
[package]
|
||||
name = "rssto-crawler"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
license = "MIT"
|
||||
readme = "README.md"
|
||||
description = "Crawl RSS feeds into MySQL database"
|
||||
keywords = ["rss", "aggregator", "conversion", "mysql", "crawler"]
|
||||
categories = ["command-line-utilities", "parsing", "text-processing", "value-formatting"]
|
||||
repository = "https://github.com/YGGverse/rssto"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.100"
|
||||
chrono = "0.4.42"
|
||||
clap = { version = "4.5.54", features = ["derive"] }
|
||||
log = "0.4.29"
|
||||
mysql = { package = "rssto-mysql", version = "0.1.0", path = "../mysql" }
|
||||
reqwest = { version = "0.13.1", features = ["blocking"] }
|
||||
rss = "2.0.12"
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
toml = "0.9.10"
|
||||
tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
|
||||
url = { version = "2.5.8", features = ["serde"] }
|
||||
21
crates/crawler/LICENSE
Normal file
21
crates/crawler/LICENSE
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2026 YGGverse
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
20
crates/crawler/config/example.toml
Normal file
20
crates/crawler/config/example.toml
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
update = 60
|
||||
|
||||
[mysql]
|
||||
host = "localhost"
|
||||
port = 3307
|
||||
user = ""
|
||||
password = ""
|
||||
database = "rssto"
|
||||
|
||||
[[channel]]
|
||||
url = "https://"
|
||||
items_limit = 20
|
||||
persist_item_title = true
|
||||
persist_item_description = true
|
||||
|
||||
[[channel]]
|
||||
url = "https://"
|
||||
items_limit = 20
|
||||
persist_item_title = true
|
||||
persist_item_description = true
|
||||
12
crates/crawler/src/argument.rs
Normal file
12
crates/crawler/src/argument.rs
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
use clap::Parser;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about, long_about = None)]
|
||||
pub struct Argument {
|
||||
/// Path to config file
|
||||
///
|
||||
/// * see `config/example.toml`
|
||||
#[arg(short, long)]
|
||||
pub config: PathBuf,
|
||||
}
|
||||
33
crates/crawler/src/config.rs
Normal file
33
crates/crawler/src/config.rs
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
use serde::Deserialize;
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Mysql {
|
||||
pub database: String,
|
||||
pub host: String,
|
||||
pub password: String,
|
||||
pub port: u16,
|
||||
pub user: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Channel {
|
||||
/// RSS feed source
|
||||
pub url: Url,
|
||||
/// Limit channel items (unlimited by default)
|
||||
pub items_limit: Option<usize>,
|
||||
/// Save item title
|
||||
pub persist_item_title: bool,
|
||||
/// Save item description
|
||||
pub persist_item_description: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Config {
|
||||
pub mysql: Mysql,
|
||||
pub channel: Vec<Channel>,
|
||||
/// Update timeout in seconds
|
||||
///
|
||||
/// * None to generate once
|
||||
pub update: Option<u64>,
|
||||
}
|
||||
124
crates/crawler/src/main.rs
Normal file
124
crates/crawler/src/main.rs
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
mod argument;
|
||||
mod config;
|
||||
|
||||
use anyhow::Result;
|
||||
use log::{debug, info, warn};
|
||||
use mysql::Mysql;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
use argument::Argument;
|
||||
use chrono::Local;
|
||||
use clap::Parser;
|
||||
use std::{env::var, fs::read_to_string};
|
||||
|
||||
if var("RUST_LOG").is_ok() {
|
||||
use tracing_subscriber::{EnvFilter, fmt::*};
|
||||
struct T;
|
||||
impl time::FormatTime for T {
|
||||
fn format_time(&self, w: &mut format::Writer<'_>) -> std::fmt::Result {
|
||||
write!(w, "{}", Local::now())
|
||||
}
|
||||
}
|
||||
fmt()
|
||||
.with_timer(T)
|
||||
.with_env_filter(EnvFilter::from_default_env())
|
||||
.init()
|
||||
}
|
||||
|
||||
let argument = Argument::parse();
|
||||
let config: config::Config = toml::from_str(&read_to_string(argument.config)?)?;
|
||||
|
||||
let mut database = Mysql::connect(
|
||||
&config.mysql.host,
|
||||
config.mysql.port,
|
||||
&config.mysql.user,
|
||||
&config.mysql.password,
|
||||
&config.mysql.database,
|
||||
)?;
|
||||
|
||||
info!("Crawler started");
|
||||
loop {
|
||||
debug!("Begin new crawl queue...");
|
||||
for feed in &config.channel {
|
||||
debug!("Update `{}`...", feed.url);
|
||||
if let Err(e) = crawl(&mut database, feed) {
|
||||
warn!("Feed `{}` update failed: `{e}`", feed.url)
|
||||
}
|
||||
}
|
||||
debug!("Crawl queue completed");
|
||||
if let Some(update) = config.update {
|
||||
debug!("Wait {update} seconds to continue...",);
|
||||
std::thread::sleep(std::time::Duration::from_secs(update))
|
||||
} else {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> {
|
||||
use reqwest::blocking::get;
|
||||
use rss::Channel;
|
||||
|
||||
let channel = Channel::read_from(&get(channel_config.url.as_str())?.bytes()?[..])?;
|
||||
let channel_items = channel.items();
|
||||
let channel_items_limit = channel_config.items_limit.unwrap_or(channel_items.len());
|
||||
|
||||
let feed_url = channel_config.url.to_string();
|
||||
let channel_id = match db.channels_by_url(&feed_url, Some(1))?.first() {
|
||||
Some(result) => result.channel_id,
|
||||
None => db.insert_channel(&feed_url)?,
|
||||
};
|
||||
|
||||
for channel_item in channel_items.iter().take(channel_items_limit) {
|
||||
let guid = match channel_item.guid {
|
||||
Some(ref guid) => guid.value.clone(),
|
||||
None => {
|
||||
warn!("Undefined `guid` field in `{feed_url}`");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let link = match channel_item.guid {
|
||||
Some(ref link) => link.value.clone(),
|
||||
None => {
|
||||
warn!("Undefined `link` field in `{feed_url}`");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let pub_date = match channel_item.pub_date {
|
||||
Some(ref pub_date) => match chrono::DateTime::parse_from_rfc2822(pub_date) {
|
||||
Ok(t) => t.timestamp(),
|
||||
Err(e) => {
|
||||
warn!("Invalid `pub_date` field in `{feed_url}`: `{e}`");
|
||||
continue;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
warn!("Undefined `pub_date` field in `{feed_url}`");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let channel_item_id = match db
|
||||
.channel_items_by_channel_id_guid(channel_id, &guid, Some(1))?
|
||||
.first()
|
||||
{
|
||||
Some(result) => result.channel_item_id,
|
||||
None => db.insert_channel_item(
|
||||
channel_id,
|
||||
pub_date,
|
||||
&guid,
|
||||
&link,
|
||||
if channel_config.persist_item_title {
|
||||
channel_item.title()
|
||||
} else {
|
||||
None
|
||||
},
|
||||
if channel_config.persist_item_description {
|
||||
channel_item.description()
|
||||
} else {
|
||||
None
|
||||
},
|
||||
)?,
|
||||
}; // @TODO
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue