remove extra features

This commit is contained in:
yggverse 2025-08-04 21:23:55 +03:00
parent 1395257882
commit 8cbae5019d
12 changed files with 153 additions and 972 deletions

View file

@ -4,25 +4,25 @@ version = "0.2.0"
edition = "2024" edition = "2024"
license = "MIT" license = "MIT"
readme = "README.md" readme = "README.md"
description = "SSD-friendly crawler for the Aquatic BitTorrent tracker based on librqbit API" description = "SSD-friendly crawler for the Aquatic BitTorrent tracker, based on the librqbit API"
keywords = ["aquatic", "librqbit", "rqbit", "crawler", "bittorrent"] keywords = ["aquatic", "librqbit", "bittorrent", "crawler", "resolver"]
categories = ["network-programming"] categories = ["network-programming"]
repository = "https://github.com/YGGverse/aquatic-crawler" repository = "https://github.com/YGGverse/aquatic-crawler"
# homepage = "https://yggverse.github.io" # homepage = "https://yggverse.github.io"
[dependencies] [dependencies]
anyhow = "1.0" anyhow = "1.0"
chrono = "0.4.41" chrono = "0.4"
clap = { version = "4.5", features = ["derive"] } clap = { version = "4.5", features = ["derive"] }
hyper-util = "0.1"
librqbit = {version = "9.0.0-beta.1", features = ["disable-upload"]} librqbit = {version = "9.0.0-beta.1", features = ["disable-upload"]}
log = "0.4"
regex = "1.11" regex = "1.11"
tokio = { version = "1.45", features = ["full"] } tokio = { version = "1.45", features = ["full"] }
tracing-subscriber = "0.3" tracing-subscriber = "0.3"
url = "2.5" url = "2.5"
urlencoding = "2.1" urlencoding = "2.1"
voca_rs = "1.15"
walkdir = "2.5" walkdir = "2.5"
[patch.crates-io] [patch.crates-io]
librqbit = { git = "https://github.com/ikatson/rqbit.git", rev="b580a9610ae7c6eaacd305a3905f7e2d3202ca69", package = "librqbit" } librqbit = { git = "https://github.com/ikatson/rqbit.git", rev="b580a9610ae7c6eaacd305a3905f7e2d3202ca69", package = "librqbit" }
#librqbit = { version = "9.0.0-beta.1", path = "../../rqbit/crates/librqbit", package = "librqbit" } #librqbit = { version = "9.0.0-beta.1", path = "../../rqbit/crates/librqbit", package = "librqbit" }

166
README.md
View file

@ -4,37 +4,11 @@
[![Dependencies](https://deps.rs/repo/github/YGGverse/aquatic-crawler/status.svg)](https://deps.rs/repo/github/YGGverse/aquatic-crawler) [![Dependencies](https://deps.rs/repo/github/YGGverse/aquatic-crawler/status.svg)](https://deps.rs/repo/github/YGGverse/aquatic-crawler)
[![crates.io](https://img.shields.io/crates/v/aquatic-crawler.svg)](https://crates.io/crates/aquatic-crawler) [![crates.io](https://img.shields.io/crates/v/aquatic-crawler.svg)](https://crates.io/crates/aquatic-crawler)
SSD-friendly crawler for the [Aquatic](https://github.com/greatest-ape/aquatic) BitTorrent tracker based on [librqbit](https://github.com/ikatson/rqbit/tree/main/crates/librqbit) API SSD-friendly crawler for the [Aquatic](https://github.com/greatest-ape/aquatic) BitTorrent tracker, based on the [librqbit](https://github.com/ikatson/rqbit/tree/main/crates/librqbit) API
> [!NOTE] > [!NOTE]
> Compatible with any other `--infohash` source in `hash1hash2...` binary format (see also the [Online API](https://github.com/YGGverse/aquatic-crawler/wiki/Online-API)) > * requires [PR#233](https://github.com/greatest-ape/aquatic/pull/233), see the [Wiki](https://github.com/YGGverse/aquatic-crawler/wiki/Aquatic) for more details
> * compatible with any other `--infohash` source in `hash1hash2...` binary format (see also the [Online API](https://github.com/YGGverse/aquatic-crawler/wiki/Online-API))
## Conception
See the project [Wiki](https://github.com/YGGverse/aquatic-crawler/wiki)
## Features
> [!TIP]
> For details on all implemented features, see the [Options](#options) section
* Info-hash versions
* [x] 1
* [ ] 2
* Import sources
* [x] IPv4 / IPv6 info-hash binary API (requires [PR#233](https://github.com/greatest-ape/aquatic/pull/233), [Wiki](https://github.com/YGGverse/aquatic-crawler/wiki/Aquatic))
* [x] local file path
* [ ] remote URL
* Export options
* [x] Content (`--preload`)
* [x] data match the regex pattern (`--preload-regex`)
* [x] data match limits (see `--preload-*` options group)
* [x] Resolved `.torrent` files (`--export-torrents`)
* [x] RSS feed (`--export-rss`) includes resolved torrent meta and magnet links to download
* customize feed options with `--export-rss-*` options group
* [ ] [Gemtext](https://geminiprotocol.net/docs/gemtext.gmi) static files catalog
* [ ] [Manticore](https://github.com/manticoresoftware/manticoresearch-rust) full text search index
* [ ] SQLite database index
## Install ## Install
@ -53,140 +27,12 @@ aquatic-crawler --infohash /path/to/info-hash-ipv4.bin\
--infohash /path/to/another-source.bin\ --infohash /path/to/another-source.bin\
--tracker udp://host1:port\ --tracker udp://host1:port\
--tracker udp://host2:port\ --tracker udp://host2:port\
--preload /path/to/directory\ --preload /path/to/directory
--enable-tcp
``` ```
* set the `RUST_LOG=debug` environment variable to enable debug output
### Options ### Options
``` bash ``` bash
-d, --debug aquatic-crawler --help
Print debug output
--infohash <INFOHASH>
Absolute path(s) or URL(s) to import infohashes from the Aquatic tracker binary API
* PR#233 feature ([Wiki](https://github.com/YGGverse/aquatic-crawler/wiki/Aquatic))
--tracker <TRACKER>
Define custom tracker(s) to preload the `.torrent` files info
--initial-peer <INITIAL_PEER>
Define initial peer(s) to preload the `.torrent` files info
--export-torrents <EXPORT_TORRENTS>
Save resolved torrent files to given directory
--export-rss <EXPORT_RSS>
File path to export RSS feed
--export-rss-title <EXPORT_RSS_TITLE>
Custom title for RSS feed (channel)
[default: aquatic-crawler]
--export-rss-link <EXPORT_RSS_LINK>
Custom link for RSS feed (channel)
--export-rss-description <EXPORT_RSS_DESCRIPTION>
Custom description for RSS feed (channel)
--export-trackers
Appends `--tracker` value to magnets and torrents
--enable-dht
Enable DHT resolver
--enable-tcp
Enable TCP connection
--bind <BIND>
Bind resolver session on specified device name (`tun0`, `mycelium`, etc.)
--listen <LISTEN>
Bind listener on specified `host:port` (`[host]:port` for IPv6)
* this option is useful only for binding the data exchange service,
to restrict the outgoing connections for torrent resolver, use `bind` option instead
--listen-upnp
Enable UPnP forwarding
--enable-upload
Enable upload (share bytes received with BitTorrent network)
--preload <PRELOAD>
Directory path to store preloaded data (e.g. `.torrent` files)
--preload-clear
Clear previous data collected on crawl session start
--preload-regex <PRELOAD_REGEX>
Preload only files match regex pattern (list only without preload by default)
* see also `preload_max_filesize`, `preload_max_filecount` options
## Example:
Filter by image ext ``` --preload-regex '(png|gif|jpeg|jpg|webp)$' ```
* requires `storage` argument defined
--preload-total-size <PRELOAD_TOTAL_SIZE>
Stop crawler on total preload files size reached
--preload-max-filesize <PRELOAD_MAX_FILESIZE>
Max size sum of preloaded files per torrent (match `preload_regex`)
--preload-max-filecount <PRELOAD_MAX_FILECOUNT>
Max count of preloaded files per torrent (match `preload_regex`)
--proxy-url <PROXY_URL>
Use `socks5://[username:password@]host:port`
--peer-connect-timeout <PEER_CONNECT_TIMEOUT>
--peer-read-write-timeout <PEER_READ_WRITE_TIMEOUT>
--peer-keep-alive-interval <PEER_KEEP_ALIVE_INTERVAL>
--index-capacity <INDEX_CAPACITY>
Estimated info-hash index capacity
[default: 1000]
--index-list
Index torrent files
--index-list-limit <INDEX_LIST_LIMIT>
Limit torrent files quantity to index
* insert the `...` placeholder as the last item, with total size left
[default: 100]
--index-timeout <INDEX_TIMEOUT>
Remove records from index older than `seconds`
--add-torrent-timeout <ADD_TORRENT_TIMEOUT>
Max time to handle each torrent
[default: 10]
--sleep <SLEEP>
Crawl loop delay in seconds
[default: 300]
--upload-limit <UPLOAD_LIMIT>
Limit upload speed (b/s)
--download-limit <DOWNLOAD_LIMIT>
Limit download speed (b/s)
-h, --help
Print help (see a summary with '-h')
-V, --version
Print version
``` ```

View file

@ -1,12 +1,11 @@
use clap::Parser; use clap::Parser;
use regex::Regex;
use std::{net::SocketAddr, path::PathBuf};
use url::Url;
#[derive(Parser, Debug)] #[derive(Parser, Debug)]
#[command(version, about, long_about = None)] #[command(version, about, long_about = None)]
pub struct Config { pub struct Config {
/// Print debug output
#[arg(short, long, default_value_t = false)]
pub debug: bool,
/// Absolute path(s) or URL(s) to import infohashes from the Aquatic tracker binary API /// Absolute path(s) or URL(s) to import infohashes from the Aquatic tracker binary API
/// ///
/// * PR#233 feature ([Wiki](https://github.com/YGGverse/aquatic-crawler/wiki/Aquatic)) /// * PR#233 feature ([Wiki](https://github.com/YGGverse/aquatic-crawler/wiki/Aquatic))
@ -15,31 +14,11 @@ pub struct Config {
/// Define custom tracker(s) to preload the `.torrent` files info /// Define custom tracker(s) to preload the `.torrent` files info
#[arg(long)] #[arg(long)]
pub tracker: Vec<String>, pub tracker: Vec<Url>,
/// Define initial peer(s) to preload the `.torrent` files info /// Define initial peer(s) to preload the `.torrent` files info
#[arg(long)] #[arg(long)]
pub initial_peer: Vec<String>, pub initial_peer: Option<Vec<SocketAddr>>,
/// Save resolved torrent files to given directory
#[arg(long)]
pub export_torrents: Option<String>,
/// File path to export RSS feed
#[arg(long)]
pub export_rss: Option<String>,
/// Custom title for RSS feed (channel)
#[arg(long, default_value_t = String::from("aquatic-crawler"))]
pub export_rss_title: String,
/// Custom link for RSS feed (channel)
#[arg(long)]
pub export_rss_link: Option<String>,
/// Custom description for RSS feed (channel)
#[arg(long)]
pub export_rss_description: Option<String>,
/// Appends `--tracker` value to magnets and torrents /// Appends `--tracker` value to magnets and torrents
#[arg(long, default_value_t = false)] #[arg(long, default_value_t = false)]
@ -49,9 +28,9 @@ pub struct Config {
#[arg(long, default_value_t = false)] #[arg(long, default_value_t = false)]
pub enable_dht: bool, pub enable_dht: bool,
/// Enable TCP connection /// Disable TCP connection
#[arg(long, default_value_t = false)] #[arg(long, default_value_t = false)]
pub enable_tcp: bool, pub disable_tcp: bool,
/// Bind resolver session on specified device name (`tun0`, `mycelium`, etc.) /// Bind resolver session on specified device name (`tun0`, `mycelium`, etc.)
#[arg(long)] #[arg(long)]
@ -74,11 +53,7 @@ pub struct Config {
/// Directory path to store preloaded data (e.g. `.torrent` files) /// Directory path to store preloaded data (e.g. `.torrent` files)
#[arg(long)] #[arg(long)]
pub preload: Option<String>, pub preload: PathBuf,
/// Clear previous data collected on crawl session start
#[arg(long, default_value_t = false)]
pub preload_clear: bool,
/// Preload only files match regex pattern (list only without preload by default) /// Preload only files match regex pattern (list only without preload by default)
/// * see also `preload_max_filesize`, `preload_max_filecount` options /// * see also `preload_max_filesize`, `preload_max_filecount` options
@ -92,11 +67,7 @@ pub struct Config {
/// ///
/// * requires `storage` argument defined /// * requires `storage` argument defined
#[arg(long)] #[arg(long)]
pub preload_regex: Option<String>, pub preload_regex: Option<Regex>,
/// Stop crawler on total preload files size reached
#[arg(long)]
pub preload_total_size: Option<u64>,
/// Max size sum of preloaded files per torrent (match `preload_regex`) /// Max size sum of preloaded files per torrent (match `preload_regex`)
#[arg(long)] #[arg(long)]
@ -108,7 +79,7 @@ pub struct Config {
/// Use `socks5://[username:password@]host:port` /// Use `socks5://[username:password@]host:port`
#[arg(long)] #[arg(long)]
pub proxy_url: Option<String>, pub proxy_url: Option<Url>,
// Peer options // Peer options
#[arg(long)] #[arg(long)]

View file

@ -1,24 +0,0 @@
/// Human-readable presentation of a byte count.
pub trait Format {
    /// Render the value as `B`, `KB`, `MB` or `GB` (1024-based units)
    fn bytes(self) -> String;
}

impl Format for u64 {
    fn bytes(self) -> String {
        const KILO: f32 = 1024.0;
        const MEGA: f32 = KILO * KILO;
        const GIGA: f32 = MEGA * KILO;

        // the unit is selected on the `f32` view of the value, so thresholds are
        // subject to `f32` rounding
        match self as f32 {
            v if v < KILO => format!("{self} B"),
            v if v < MEGA => format!("{:.2} KB", v / KILO),
            v if v < GIGA => format!("{:.2} MB", v / MEGA),
            v => format!("{:.2} GB", v / GIGA),
        }
    }
}

View file

@ -1,111 +0,0 @@
mod value;
use chrono::{Duration, Utc};
use std::collections::HashMap;
use value::Value;
/// Session-scoped registry of processed info-hashes, used to skip them on later iterations
/// * can also keep optional meta info, so the index may be exported as RSS or another format
pub struct Index {
    /// Processed entries keyed by info-hash
    index: HashMap<String, Value>,
    /// Maximum entry age; older entries are dropped by `Self::refresh`
    timeout: Option<Duration>,
    /// Raised when `Self::insert` adds a new key, reset by `Self::refresh`
    /// * lets callers skip redundant disk writes (e.g. static RSS feed generation)
    is_changed: bool,
    /// Meta switches: the matching value is kept by `Self::insert` only when enabled
    has_name: bool,
    has_size: bool,
    has_list: bool,
}

impl Index {
    /// Create an empty index
    /// * `capacity` - initial capacity reserved for the map
    /// * `timeout` - optional entry lifetime in seconds
    /// * `has_name`, `has_size`, `has_list` - keep the matching meta value on insert
    pub fn init(
        capacity: usize,
        timeout: Option<i64>,
        has_name: bool,
        has_size: bool,
        has_list: bool,
    ) -> Self {
        let index = HashMap::with_capacity(capacity);
        let timeout = timeout.map(Duration::seconds);
        Self {
            index,
            timeout,
            is_changed: false,
            has_name,
            has_size,
            has_list,
        }
    }

    /// Check whether `infohash` is already registered
    pub fn has(&self, infohash: &str) -> bool {
        self.index.contains_key(infohash)
    }

    /// `true` when a new key was inserted since the last `Self::refresh`
    pub fn is_changed(&self) -> bool {
        self.is_changed
    }

    /// Borrow all entries
    pub fn list(&self) -> &HashMap<String, Value> {
        &self.index
    }

    /// Number of registered entries
    pub fn len(&self) -> usize {
        self.index.len()
    }

    /// Sum of the `node` member over all entries
    pub fn nodes(&self) -> u64 {
        self.index.values().map(|entry| entry.node).sum()
    }

    /// Register `infohash` (an existing entry with the same key is replaced)
    /// * `size`, `name` and `list` are stored only if enabled on `Self::init`
    /// * the change flag is raised only when the key was not present before
    pub fn insert(
        &mut self,
        infohash: String,
        node: u64,
        size: u64,
        list: Option<Vec<(Option<String>, u64)>>,
        name: Option<String>,
    ) {
        let size = self.has_size.then_some(size);
        let name = name.filter(|_| self.has_name);
        let list = list.filter(|_| self.has_list);
        let is_new = self
            .index
            .insert(infohash, Value::new(node, size, name, list))
            .is_none();
        if is_new {
            self.is_changed = true
        }
    }

    /// Drop entries older than the configured timeout (if any), then reset the change flag
    pub fn refresh(&mut self) {
        if let Some(max_age) = self.timeout {
            let now = Utc::now();
            self.index.retain(|_, entry| now - entry.time <= max_age)
        }
        self.is_changed = false
    }
}
#[test]
fn test() {
    use std::{thread::sleep, time::Duration};

    // entries older than the 3-second timeout must be dropped on refresh
    let mut index = Index::init(2, Some(3), false, false, false);

    index.insert(String::from("h1"), 0, 0, None, None);
    sleep(Duration::from_secs(1));
    index.insert(String::from("h2"), 0, 0, None, None);

    // both entries are still younger than the timeout
    index.refresh();
    assert_eq!(index.len(), 2);

    // `h1` has now outlived the timeout, `h2` has not
    sleep(Duration::from_secs(2));
    index.refresh();
    assert_eq!(index.len(), 1)
}

View file

@ -1,54 +0,0 @@
use chrono::{DateTime, Utc};
use voca_rs::Voca;
/// Single `Index` entry: creation timestamp, node value and optional meta info
pub struct Value {
    pub time: DateTime<Utc>,
    pub node: u64,
    // Private on purpose: these members are set only by `Self::new`,
    // which passes the strings through `filter`
    size: Option<u64>,
    name: Option<String>,
    list: Option<Vec<(Option<String>, u64)>>,
}

impl Value {
    /// Build a new `Self` stamped with the current UTC time
    /// * `name` and every file name in `list` are passed through `filter`
    pub fn new(
        node: u64,
        size: Option<u64>,
        name: Option<String>,
        list: Option<Vec<(Option<String>, u64)>>,
    ) -> Self {
        let time = Utc::now();
        let list = list.map(|files| {
            files
                .into_iter()
                .map(|(path, length)| (filter(path), length))
                .collect()
        });
        let name = filter(name);
        Self {
            time,
            node,
            size,
            name,
            list,
        }
    }

    /// Borrow the filtered `name` member
    pub fn name(&self) -> Option<&String> {
        self.name.as_ref()
    }

    /// Borrow the files `list` member (file names are filtered)
    pub fn list(&self) -> Option<&Vec<(Option<String>, u64)>> {
        self.list.as_ref()
    }

    /// Copy of the `size` member
    pub fn size(&self) -> Option<u64> {
        self.size
    }
}
/// Strip BOM chars and tags, then crop long strings (prevents memory pool overload)
/// * strings longer than the limit keep their first 125 chars, followed by `...`
fn filter(value: Option<String>) -> Option<String> {
    const LIMIT: usize = 125; // + 3 for `...` offset, 128 chars max @TODO optional
    let text = value?._strip_bom()._strip_tags();
    // a char at index `LIMIT` exists only when the string is longer than `LIMIT` chars;
    // its byte offset is where the string gets cut
    Some(match text.char_indices().nth(LIMIT) {
        Some((cut, _)) => format!("{}...", &text[..cut]),
        None => text,
    })
}

View file

@ -1,25 +1,15 @@
mod api; mod api;
mod config; mod config;
mod format;
mod index;
mod peers;
mod preload; mod preload;
mod rss;
mod torrent;
mod trackers;
use anyhow::Result; use anyhow::Result;
use config::Config; use config::Config;
use index::Index;
use librqbit::{ use librqbit::{
AddTorrent, AddTorrentOptions, AddTorrentResponse, ByteBufOwned, ConnectionOptions, AddTorrent, AddTorrentOptions, AddTorrentResponse, ConnectionOptions, ListenerOptions,
ListenerOptions, PeerConnectionOptions, SessionOptions, ValidatedTorrentMetaV1Info, PeerConnectionOptions, SessionOptions,
}; };
use peers::Peers; use preload::Preload;
use rss::Rss; use std::{collections::HashSet, num::NonZero, str::FromStr, time::Duration};
use std::{collections::HashSet, num::NonZero, path::PathBuf, str::FromStr, time::Duration};
use torrent::Torrent;
use trackers::Trackers;
use url::Url; use url::Url;
#[tokio::main] #[tokio::main]
@ -27,29 +17,21 @@ async fn main() -> Result<()> {
use chrono::Local; use chrono::Local;
use clap::Parser; use clap::Parser;
use tokio::time; use tokio::time;
// init debug
if std::env::var("RUST_LOG").is_ok() {
tracing_subscriber::fmt::init()
} // librqbit
// init components // init components
let time_init = Local::now(); let time_init = Local::now();
let config = Config::parse(); let config = Config::parse();
if std::env::var("RUST_LOG").is_ok() { let preload = Preload::init(
tracing_subscriber::fmt::init()
}
let peers = Peers::init(&config.initial_peer)?;
let preload = preload::init(
config.preload, config.preload,
config.preload_regex, config.preload_regex,
config.preload_max_filecount, config.preload_max_filecount,
config.preload_max_filesize, config.preload_max_filesize,
config.preload_total_size,
config.preload_clear,
)?; )?;
let trackers = Trackers::init(&config.tracker)?;
let torrent = config.export_torrents.map(|p| Torrent::init(&p).unwrap());
let session = librqbit::Session::new_with_opts( let session = librqbit::Session::new_with_opts(
match preload { preload.root().clone(),
Some(ref p) => p.path(),
None => PathBuf::new(),
},
SessionOptions { SessionOptions {
bind_device_name: config.bind, bind_device_name: config.bind,
listen: match config.listen { listen: match config.listen {
@ -66,8 +48,8 @@ async fn main() -> Result<()> {
} }
}, },
connect: Some(ConnectionOptions { connect: Some(ConnectionOptions {
enable_tcp: config.enable_tcp, enable_tcp: !config.disable_tcp,
proxy_url: config.proxy_url, proxy_url: config.proxy_url.map(|u| u.to_string()),
peer_opts: Some(PeerConnectionOptions { peer_opts: Some(PeerConnectionOptions {
connect_timeout: config.peer_connect_timeout.map(Duration::from_secs), connect_timeout: config.peer_connect_timeout.map(Duration::from_secs),
read_write_timeout: config.peer_read_write_timeout.map(Duration::from_secs), read_write_timeout: config.peer_read_write_timeout.map(Duration::from_secs),
@ -82,54 +64,35 @@ async fn main() -> Result<()> {
upload_bps: config.upload_limit.and_then(NonZero::new), upload_bps: config.upload_limit.and_then(NonZero::new),
download_bps: config.download_limit.and_then(NonZero::new), download_bps: config.download_limit.and_then(NonZero::new),
}, },
trackers: trackers.list().clone(), trackers: config.tracker.iter().cloned().collect(),
..SessionOptions::default() ..SessionOptions::default()
}, },
) )
.await?; .await?;
log::info!("Crawler started on {time_init}");
// begin
println!("Crawler started on {time_init}");
let mut index = Index::init(
config.index_capacity,
config.index_timeout,
config.export_rss.is_some(),
config.export_rss.is_some(),
config.export_rss.is_some() && config.index_list,
);
loop { loop {
let time_queue = Local::now(); let time_queue = Local::now();
if config.debug { log::debug!("Queue crawl begin on {time_queue}...");
println!("\tQueue crawl begin on {time_queue}...")
}
index.refresh();
for source in &config.infohash { for source in &config.infohash {
if config.debug { log::debug!("Index source `{source}`...");
println!("\tIndex source `{source}`...")
}
// grab latest info-hashes from this source // grab latest info-hashes from this source
// * aquatic server may update the stats at this moment, handle result manually // * aquatic server may update the stats at this moment, handle result manually
for i in match api::get(source, config.index_capacity) { for i in match api::get(source, config.index_capacity) {
Some(i) => i, Some(i) => i,
None => { None => {
// skip without panic // skip without panic
if config.debug { log::warn!(
eprintln!(
"The feed `{source}` has an incomplete format (or is still updating); skip." "The feed `{source}` has an incomplete format (or is still updating); skip."
) );
}
continue; continue;
} }
} { } {
// convert to string once // convert to string once
let i = i.to_string(); let i = i.to_string();
// is already indexed? if preload.contains_torrent(&i)? {
if index.has(&i) {
continue; continue;
} }
if config.debug { log::debug!("Index `{i}`...");
println!("\t\tIndex `{i}`...")
}
// run the crawler in single thread for performance reasons, // run the crawler in single thread for performance reasons,
// use `timeout` argument option to skip the dead connections. // use `timeout` argument option to skip the dead connections.
match time::timeout( match time::timeout(
@ -137,18 +100,18 @@ async fn main() -> Result<()> {
session.add_torrent( session.add_torrent(
AddTorrent::from_url(magnet( AddTorrent::from_url(magnet(
&i, &i,
if config.export_trackers && !trackers.is_empty() { if config.tracker.is_empty() {
Some(trackers.list())
} else {
None None
} else {
Some(config.tracker.as_ref())
}, },
)), )),
Some(AddTorrentOptions { Some(AddTorrentOptions {
paused: true, // continue after `only_files` init paused: true, // continue after `only_files` init
overwrite: true, overwrite: true,
disable_trackers: trackers.is_empty(), disable_trackers: config.tracker.is_empty(),
initial_peers: peers.initial_peers(), initial_peers: config.initial_peer.clone(),
list_only: preload.as_ref().is_none_or(|p| p.regex.is_none()), list_only: false,
// it is important to blacklist all files preload until initiation // it is important to blacklist all files preload until initiation
only_files: Some(Vec::with_capacity( only_files: Some(Vec::with_capacity(
config.preload_max_filecount.unwrap_or_default(), config.preload_max_filecount.unwrap_or_default(),
@ -156,8 +119,9 @@ async fn main() -> Result<()> {
// the destination folder to preload files match `only_files_regex` // the destination folder to preload files match `only_files_regex`
// * e.g. images for audio albums // * e.g. images for audio albums
output_folder: preload output_folder: preload
.as_ref() .output_folder(&i)?
.map(|p| p.output_folder(&i, true).unwrap()), .to_str()
.map(|s| s.to_string()),
..Default::default() ..Default::default()
}), }),
), ),
@ -167,151 +131,63 @@ async fn main() -> Result<()> {
Ok(r) => match r { Ok(r) => match r {
// on `preload_regex` case only // on `preload_regex` case only
Ok(AddTorrentResponse::Added(id, mt)) => { Ok(AddTorrentResponse::Added(id, mt)) => {
let mut only_files_size = 0; let mut keep_files = HashSet::with_capacity(
let mut only_files_keep = Vec::with_capacity(
config.preload_max_filecount.unwrap_or_default(), config.preload_max_filecount.unwrap_or_default(),
); );
let mut only_files = HashSet::with_capacity( let mut only_files = HashSet::with_capacity(
config.preload_max_filecount.unwrap_or_default(), config.preload_max_filecount.unwrap_or_default(),
); );
mt.wait_until_initialized().await?; mt.wait_until_initialized().await?;
let (name, size, list) = mt.with_metadata(|m| { let bytes = mt.with_metadata(|m| {
// init preload files list
if let Some(ref p) = preload {
for (id, info) in m.file_infos.iter().enumerate() { for (id, info) in m.file_infos.iter().enumerate() {
if p.matches(info.relative_filename.to_str().unwrap()) { if preload
if p.max_filesize.is_some_and(|limit| { .max_filecount
only_files_size + info.len > limit
}) {
if config.debug {
println!(
"\t\t\ttotal files size limit `{i}` reached!"
)
}
break;
}
if p.max_filecount
.is_some_and(|limit| only_files.len() + 1 > limit) .is_some_and(|limit| only_files.len() + 1 > limit)
{ {
if config.debug { log::debug!(
println!( "file count limit reached, skip `{id}` for `{i}`"
"\t\t\ttotal files count limit for `{i}` reached!" );
)
}
break; break;
} }
only_files_size += info.len; if preload.max_filesize.is_some_and(|limit| info.len > limit) {
if let Some(ref p) = preload { log::debug!(
only_files_keep "file size limit reached, skip `{id}` for `{i}`"
.push(p.absolute(&i, &info.relative_filename)) );
continue;
} }
only_files.insert(id); if preload.regex.as_ref().is_some_and(|r| {
!r.is_match(&info.relative_filename.to_string_lossy())
}) {
log::debug!("regex filter, skip `{id}` for `{i}`");
continue;
} }
assert!(keep_files.insert(info.relative_filename.clone()));
assert!(only_files.insert(id));
} }
} m.torrent_bytes.to_vec()
if let Some(ref t) = torrent {
save_torrent_file(t, &i, &m.torrent_bytes, config.debug)
}
(
m.info.name().as_ref().map(|n| n.to_string()),
size(&m.info),
list(&m.info, config.index_list_limit),
)
})?; })?;
session.update_only_files(&mt, &only_files).await?; session.update_only_files(&mt, &only_files).await?;
session.unpause(&mt).await?; session.unpause(&mt).await?;
// await for `preload_regex` files download to continue // await for `preload_regex` files download to continue
mt.wait_until_completed().await?; mt.wait_until_completed().await?;
// cleanup irrelevant files (see rqbit#408)
preload.cleanup(&i, Some(keep_files))?;
preload.persist_torrent_bytes(&i, &bytes)?;
// remove torrent from session as indexed // remove torrent from session as indexed
session session
.delete(librqbit::api::TorrentIdOrHash::Id(id), false) .delete(librqbit::api::TorrentIdOrHash::Id(id), false)
.await?; .await?;
// cleanup irrelevant files (see rqbit#408) log::debug!("torrent `{i}` indexed.")
if let Some(p) = &preload {
p.cleanup(&i, Some(only_files_keep))?
} }
Ok(_) => panic!(),
if config.debug { Err(e) => log::debug!("Failed to resolve `{i}`: `{e}`."),
println!("\t\t\tadd `{i}` to index.")
}
index.insert(
i,
only_files_size,
size,
list,
name.map(|n| n.to_string()),
)
}
Ok(AddTorrentResponse::ListOnly(r)) => {
if let Some(ref t) = torrent {
save_torrent_file(t, &i, &r.torrent_bytes, config.debug)
}
// @TODO
// use `r.info` for Memory, SQLite,
// Manticore and other alternative storage type
if config.debug {
println!("\t\t\tadd `{i}` to index.")
}
index.insert(
i,
0,
size(&r.info),
list(&r.info, config.index_list_limit),
r.info.name().map(|n| n.to_string()),
)
}
// unexpected as should be deleted
Ok(AddTorrentResponse::AlreadyManaged(..)) => panic!(),
Err(e) => eprintln!("Failed to resolve `{i}`: `{e}`."),
}, },
Err(e) => { Err(e) => log::debug!("failed to resolve `{i}`: `{e}`"),
if config.debug {
println!("\t\t\tfailed to resolve `{i}`: `{e}`")
} }
} }
} }
} log::debug!(
} "Queue completed at {time_queue} (time: {} / uptime: {}) await {} seconds to continue...",
if let Some(ref export_rss) = config.export_rss
&& index.is_changed()
{
let mut rss = Rss::new(
export_rss,
&config.export_rss_title,
&config.export_rss_link,
&config.export_rss_description,
if config.export_trackers && !trackers.is_empty() {
Some(trackers.list().clone())
} else {
None
},
)?;
for (k, v) in index.list() {
rss.push(
k,
v.name().unwrap_or(k),
rss::item_description(v.size(), v.list()),
Some(&v.time.to_rfc2822()),
)?
}
rss.commit()?
}
if preload
.as_ref()
.is_some_and(|p| p.total_size.is_some_and(|s| index.nodes() > s))
{
panic!("Preload content size {} bytes reached!", 0)
}
if config.debug {
println!(
"Queue completed on {time_queue}\n\ttotal: {}\n\ttime: {} s\n\tuptime: {} s\n\tawait {} seconds to continue...",
index.len(),
Local::now() Local::now()
.signed_duration_since(time_queue) .signed_duration_since(time_queue)
.as_seconds_f32(), .as_seconds_f32(),
@ -319,31 +195,15 @@ async fn main() -> Result<()> {
.signed_duration_since(time_init) .signed_duration_since(time_init)
.as_seconds_f32(), .as_seconds_f32(),
config.sleep, config.sleep,
) );
}
std::thread::sleep(Duration::from_secs(config.sleep)) std::thread::sleep(Duration::from_secs(config.sleep))
} }
} }
/// Shared handler: persist resolved torrent bytes `b` for info-hash `i` through `t`
/// * with `d` (debug) enabled, the outcome is printed to stdout
/// * a persist error is always reported to stderr and is not propagated
fn save_torrent_file(t: &Torrent, i: &str, b: &[u8], d: bool) {
    match (t.persist(i, b), d) {
        (Ok(Some(p)), true) => println!("\t\t\tadd torrent file `{}`", p.to_string_lossy()),
        (Ok(None), true) => println!("\t\t\ttorrent file `{i}` already exists"),
        // success with debug output disabled: nothing to report
        (Ok(_), false) => {}
        (Err(e), _) => eprintln!("Error on save torrent file `{i}`: `{e}`"),
    }
}
/// Build magnet URI /// Build magnet URI
fn magnet(infohash: &str, trackers: Option<&HashSet<Url>>) -> String { fn magnet(info_hash: &str, trackers: Option<&Vec<Url>>) -> String {
let mut m = if infohash.len() == 40 { let mut m = if info_hash.len() == 40 {
format!("magnet:?xt=urn:btih:{infohash}") format!("magnet:?xt=urn:btih:{info_hash}")
} else { } else {
todo!("infohash v2 is not supported by librqbit") todo!("infohash v2 is not supported by librqbit")
}; };
@ -355,62 +215,3 @@ fn magnet(infohash: &str, trackers: Option<&HashSet<Url>>) -> String {
} }
m m
} }
/// Total content size: the top-level `length` (when set) plus the `length`
/// of every entry in `files` (when set)
fn size(meta: &ValidatedTorrentMetaV1Info<ByteBufOwned>) -> u64 {
    let info = meta.info();
    info.files
        .iter()
        .flatten()
        .fold(info.length.unwrap_or(0), |total, file| total + file.length)
}
/// Build the file listing of a torrent as `(path, length)` pairs
///
/// * returns `None` when the torrent info has no `files` member
/// * at most `limit` files are listed, sorted by path; a path is `None` when its
///   bytes are not valid UTF-8
/// * when more than `limit` files exist, a trailing `...` entry holds the summed
///   length of every file that was not listed
fn list(
    meta: &ValidatedTorrentMetaV1Info<ByteBufOwned>,
    limit: usize,
) -> Option<Vec<(Option<String>, u64)>> {
    meta.info().files.as_ref().map(|files| {
        let mut entries: Vec<(Option<String>, u64)> = files
            .iter()
            .take(limit)
            .map(|file| {
                // join the path components with `/`
                let mut raw = Vec::new();
                for (n, part) in file.path.iter().enumerate() {
                    if n > 0 {
                        raw.push(b'/');
                    }
                    raw.extend_from_slice(&part.0);
                }
                (String::from_utf8(raw).ok(), file.length)
            })
            .collect();
        // sort the listed files only, the placeholder (if any) stays last
        entries.sort_by(|a, b| a.0.cmp(&b.0)); // @TODO optional
        if files.len() > limit {
            // limit reached: sum up *all* files left, starting from the first unlisted one
            // (the previous implementation dropped that file from the total)
            let left: u64 = files.iter().skip(limit).map(|file| file.length).sum();
            entries.push((Some("...".to_string()), left));
        }
        entries
    })
}

View file

@ -1,21 +0,0 @@
use std::{net::SocketAddr, str::FromStr};
/// Initial peers, parsed from their textual socket address form (e.g. `127.0.0.1:6881`)
pub struct Peers(Vec<SocketAddr>);

impl Peers {
    /// Parse every item of `peers` as a socket address
    ///
    /// # Errors
    /// Returns the parse error of the first item that is not a valid socket address
    pub fn init(peers: &[String]) -> anyhow::Result<Self> {
        // a slice accepts `&Vec<String>` callers as well; `collect` stops on the first error
        let parsed = peers
            .iter()
            .map(|peer| SocketAddr::from_str(peer))
            .collect::<Result<Vec<_>, _>>()?;
        Ok(Self(parsed))
    }

    /// Copy of the peer list, or `None` when it is empty
    pub fn initial_peers(&self) -> Option<Vec<SocketAddr>> {
        (!self.0.is_empty()).then(|| self.0.clone())
    }
}

View file

@ -1,123 +1,90 @@
use anyhow::{Result, bail}; use anyhow::{Result, bail};
use regex::Regex; use regex::Regex;
use std::{fs, path::PathBuf, str::FromStr}; use std::{collections::HashSet, fs, path::PathBuf};
pub struct Preload { pub struct Preload {
path: PathBuf, root: PathBuf,
pub max_filecount: Option<usize>, pub max_filecount: Option<usize>,
pub max_filesize: Option<u64>, pub max_filesize: Option<u64>,
pub total_size: Option<u64>,
pub regex: Option<Regex>, pub regex: Option<Regex>,
} }
impl Preload { impl Preload {
fn init( // Constructors
directory: &str,
regex: Option<String>, pub fn init(
root: PathBuf,
regex: Option<Regex>,
max_filecount: Option<usize>, max_filecount: Option<usize>,
max_filesize: Option<u64>, max_filesize: Option<u64>,
total_size: Option<u64>,
is_clear: bool,
) -> Result<Self> { ) -> Result<Self> {
let path = PathBuf::from_str(directory)?; if !root.is_dir() {
if let Ok(t) = fs::metadata(&path) { bail!("Preload root is not directory")
if t.is_file() {
bail!("Storage destination is not directory!");
} }
if t.is_dir() && is_clear {
for i in fs::read_dir(&path)? {
let r = i?.path();
if r.is_dir() {
fs::remove_dir_all(&r)?;
} else {
fs::remove_file(&r)?;
}
}
}
}
fs::create_dir_all(&path)?;
Ok(Self { Ok(Self {
max_filecount, max_filecount,
max_filesize, max_filesize,
path, regex,
regex: regex.map(|r| Regex::new(&r).unwrap()), root: root.canonicalize()?,
total_size,
}) })
} }
pub fn output_folder(&self, infohash: &str, create: bool) -> Result<String> { // Actions
let mut p = PathBuf::new();
p.push(&self.path);
p.push(infohash);
if p.is_file() {
bail!("File destination is not directory!");
}
if create {
fs::create_dir_all(&p)?;
}
if !p.is_dir() {
bail!("Destination directory not exists!")
}
Ok(p.to_string_lossy().to_string())
}
pub fn absolute(&self, infohash: &str, file: &PathBuf) -> PathBuf {
let mut p = PathBuf::new();
p.push(&self.path);
p.push(infohash);
p.push(file);
p
}
/// Recursively remove all files under the `infohash` location (see rqbit#408) /// Recursively remove all files under the `infohash` location (see rqbit#408)
pub fn cleanup(&self, infohash: &str, keep_filenames: Option<Vec<PathBuf>>) -> Result<()> { pub fn cleanup(&self, info_hash: &str, keep_filenames: Option<HashSet<PathBuf>>) -> Result<()> {
for e in walkdir::WalkDir::new(self.output_folder(infohash, false)?) { for e in walkdir::WalkDir::new(self.output_folder(info_hash)?) {
let e = e?; let e = e?;
let p = e.into_path(); let p = e.into_path();
if p.is_file() && keep_filenames.as_ref().is_none_or(|k| !k.contains(&p)) { if p.is_file() && keep_filenames.as_ref().is_none_or(|k| !k.contains(&p)) {
fs::remove_file(p)?; fs::remove_file(p)?;
} }
} } // remove empty directories @TODO
Ok(()) Ok(())
} }
pub fn path(&self) -> PathBuf { pub fn persist_torrent_bytes(&self, info_hash: &str, contents: &[u8]) -> Result<PathBuf> {
self.path.clone() let p = self.torrent(info_hash)?;
fs::write(&p, contents)?;
Ok(p)
} }
pub fn matches(&self, pattern: &str) -> bool { // Getters
self.regex.as_ref().is_some_and(|r| r.is_match(pattern))
/// * creates new directory if not exists
pub fn output_folder(&self, info_hash: &str) -> Result<PathBuf> {
if !is_info_hash(info_hash) {
bail!("Invalid info-hash `{info_hash}`")
}
let mut p = PathBuf::from(&self.root);
p.push(info_hash);
if p.is_file() {
bail!("Output directory for info-hash `{info_hash}` is file")
}
if !p.exists() {
fs::create_dir(&p)?
}
Ok(p)
}
pub fn root(&self) -> &PathBuf {
&self.root
}
pub fn contains_torrent(&self, info_hash: &str) -> Result<bool> {
Ok(fs::exists(self.torrent(info_hash)?)?)
}
fn torrent(&self, info_hash: &str) -> Result<PathBuf> {
if !is_info_hash(info_hash) {
bail!("Invalid info-hash `{info_hash}`")
}
let mut p = PathBuf::from(&self.root);
p.push(format!("{info_hash}.torrent"));
Ok(p)
} }
} }
/// Init `Preload` with validate related argument options fn is_info_hash(value: &str) -> bool {
pub fn init( value.len() == 40 && value.chars().all(|c| c.is_ascii_hexdigit())
path: Option<String>,
regex: Option<String>,
max_filecount: Option<usize>,
max_filesize: Option<u64>,
total_size: Option<u64>,
is_clear: bool,
) -> Result<Option<Preload>> {
Ok(match path {
Some(ref p) => Some(Preload::init(
p,
regex,
max_filecount,
max_filesize,
total_size,
is_clear,
)?),
None => {
if regex.is_some()
|| max_filecount.is_some()
|| max_filesize.is_some()
|| total_size.is_some()
|| is_clear
{
bail!("`--preload` directory is required for this configuration!")
}
None
}
})
} }

View file

@ -1,142 +0,0 @@
use anyhow::{Result, bail};
use std::{collections::HashSet, fs::File, io::Write, path::PathBuf, str::FromStr};
use url::Url;
/// Export crawl index to the RSS file
///
/// The feed is written to a temporary file first; `commit` closes the
/// channel and renames that file over the public path.
pub struct Rss {
    /// Open handle of the temporary file (created at the `tmp` path) that receives the XML
    file: File,
    /// Public destination path of the feed; `commit` renames `tmp` to it
    target: PathBuf,
    /// Temporary file path (`{filepath}.tmp`), used so an incomplete feed is not read
    tmp: PathBuf,
    /// Trackers passed to `crate::magnet` when building the link of every item in channel
    trackers: Option<HashSet<Url>>,
}
impl Rss {
    /// Create the temporary feed file for `filepath` and write the channel header.
    ///
    /// The header consists of the XML prolog, `pubDate` / `lastBuildDate`
    /// (current UTC time, RFC 2822), the escaped `title` and, when given,
    /// the escaped `description` and `link`.
    ///
    /// # Errors
    /// Fails when `filepath` is an existing directory, or when the temporary
    /// file `{filepath}.tmp` cannot be created or written.
    pub fn new(
        filepath: &str,
        title: &str,
        link: &Option<String>,
        description: &Option<String>,
        trackers: Option<HashSet<Url>>,
    ) -> Result<Self> {
        // prevent from reading of the incomplete file
        let tmp = PathBuf::from_str(&format!("{filepath}.tmp"))?;
        // init public destination
        let target = PathBuf::from_str(filepath)?;
        if target.is_dir() {
            bail!("RSS path `{}` is directory", target.to_string_lossy())
        }
        // init temporary file to write
        let mut file = File::create(&tmp)?;
        // `File` is unbuffered: assemble the header in memory and hand it to
        // the OS with a single write instead of one syscall per fragment
        let now = chrono::Utc::now().to_rfc2822();
        let mut header = format!(
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?><rss version=\"2.0\"><channel>\
             <pubDate>{now}</pubDate><lastBuildDate>{now}</lastBuildDate><title>{}</title>",
            escape(title)
        );
        if let Some(s) = description {
            header.push_str("<description>");
            header.push_str(&escape(s));
            header.push_str("</description>");
        }
        if let Some(s) = link {
            header.push_str("<link>");
            header.push_str(&escape(s));
            header.push_str("</link>");
        }
        file.write_all(header.as_bytes())?;
        Ok(Self {
            file,
            target,
            trackers,
            tmp,
        })
    }

    /// Append `item` to the feed `channel`
    ///
    /// The item `guid` is the info-hash, marked `isPermaLink="false"` because
    /// it is an opaque identifier, not a URL (RSS 2.0 defaults the attribute
    /// to `true`). The `link` is the magnet URI built by `crate::magnet`.
    ///
    /// # Errors
    /// Fails when the temporary file cannot be written.
    pub fn push(
        &mut self,
        infohash: &str,
        title: &str,
        description: Option<String>,
        pub_date: Option<&str>,
    ) -> Result<()> {
        let mut item = format!(
            "<item><guid isPermaLink=\"false\">{}</guid><title>{}</title><link>{}</link>",
            escape(infohash),
            escape(title),
            escape(&crate::magnet(infohash, self.trackers.as_ref()))
        );
        if let Some(s) = description {
            item.push_str("<description>");
            item.push_str(&escape(&s));
            item.push_str("</description>");
        }
        if let Some(s) = pub_date {
            item.push_str("<pubDate>");
            item.push_str(&escape(s));
            item.push_str("</pubDate>");
        }
        item.push_str("</item>");
        // one write per item instead of one per fragment
        self.file.write_all(item.as_bytes())?;
        Ok(())
    }

    /// Write final bytes, replace public file with temporary one
    ///
    /// # Errors
    /// Fails when the closing tags cannot be written or the rename fails.
    pub fn commit(mut self) -> Result<()> {
        self.file.write_all(b"</channel></rss>")?;
        std::fs::rename(self.tmp, self.target)?;
        Ok(())
    }
}
/// Build the plain-text description of a feed item.
///
/// The first line is the formatted total `size` (when given), followed by
/// one `path (size)` line per entry of `list`; a missing path is printed as
/// `?`. Returns `None` only when both `size` and `list` are `None`.
pub fn item_description(
    size: Option<u64>,
    list: Option<&Vec<(Option<String>, u64)>>,
) -> Option<String> {
    use crate::format::Format;
    if size.is_none() && list.is_none() {
        return None;
    }
    let total = size.map(|s| s.bytes());
    let files = list.into_iter().flatten().map(|(path, size)| {
        format!(
            "{} ({})",
            path.as_deref().unwrap_or("?"), // @TODO invalid encoding
            size.bytes()
        )
    });
    let lines: Vec<String> = total.into_iter().chain(files).collect();
    Some(lines.join("\n"))
}
/// Replace the XML special characters `&`, `<`, `>`, `"` and `'` in
/// `subject` with their predefined entities.
fn escape(subject: &str) -> String {
    let mut escaped = String::with_capacity(subject.len());
    for c in subject.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}

View file

@ -1,32 +0,0 @@
use anyhow::Result;
use std::{fs, io::Write, path::PathBuf, str::FromStr};
/// Stores `.torrent` files in a single directory
pub struct Torrent {
    /// Canonicalized directory that receives the `{infohash}.torrent` files
    storage: PathBuf,
}
impl Torrent {
    /// Use the directory `path` as the torrent storage.
    ///
    /// # Errors
    /// Fails when `path` cannot be canonicalized (for example, it does not exist).
    pub fn init(path: &str) -> Result<Self> {
        Ok(Self {
            storage: PathBuf::from_str(path)?.canonicalize()?,
        })
    }

    /// Write `data` to `{infohash}.torrent` unless that path already exists.
    ///
    /// Returns the path of the newly created file, or `None` when the path
    /// was already present (nothing is written in that case).
    ///
    /// # Errors
    /// Fails when the file cannot be created or written.
    pub fn persist(&self, infohash: &str, data: &[u8]) -> Result<Option<PathBuf>> {
        let p = self.path(infohash);
        // `create_new` tests for existence and creates in one atomic step, so
        // there is no window between an `exists()` check and the creation in
        // which another writer could slip in and get truncated
        let mut f = match fs::OpenOptions::new().write(true).create_new(true).open(&p) {
            Ok(f) => f,
            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => return Ok(None),
            Err(e) => return Err(e.into()),
        };
        f.write_all(data)?;
        Ok(Some(p))
    }

    /// Location of the `.torrent` file for `infohash` inside the storage directory.
    fn path(&self, infohash: &str) -> PathBuf {
        self.storage.join(format!("{infohash}.torrent"))
    }
}

View file

@ -1,20 +0,0 @@
use std::{collections::HashSet, str::FromStr};
use url::Url;
/// Newtype over the deduplicated set of tracker URLs parsed by `Trackers::init`.
pub struct Trackers(HashSet<Url>);
impl Trackers {
    /// Parse every entry of `trackers` into a [`Url`]; duplicates collapse
    /// because the result is a set.
    ///
    /// Accepts any string slice (`&Vec<String>` callers keep working through
    /// deref coercion).
    ///
    /// # Errors
    /// Returns the parse error of the first entry that is not a valid URL.
    pub fn init(trackers: &[String]) -> anyhow::Result<Self> {
        let parsed = trackers
            .iter()
            .map(|tracker| Url::from_str(tracker))
            .collect::<Result<HashSet<_>, _>>()?;
        Ok(Self(parsed))
    }

    /// `true` when no tracker URL is held.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Borrow the set of tracker URLs.
    pub fn list(&self) -> &HashSet<Url> {
        &self.0
    }
}