implement some 0.2.0 features

This commit is contained in:
yggverse 2025-06-14 23:33:11 +03:00
parent 18116491cb
commit 10c2d7e855
11 changed files with 319 additions and 157 deletions

View file

@ -11,15 +11,15 @@ repository = "https://github.com/YGGverse/aquatic-crawler"
# homepage = "https://yggverse.github.io" # homepage = "https://yggverse.github.io"
[dependencies] [dependencies]
anyhow = "1.0.98" anyhow = "1.0"
clap = { version = "4.5", features = ["derive"] } clap = { version = "4.5", features = ["derive"] }
hyper-util = "0.1.14" hyper-util = "0.1"
librqbit = {version = "9.0.0-beta.0", features = ["disable-upload"]} librqbit = {version = "8.1", features = ["disable-upload"]}
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0" serde_json = "1.0"
tokio = { version = "1.45", features = ["full"] } tokio = { version = "1.45", features = ["full"] }
tracing-subscriber = "0.3" tracing-subscriber = "0.3"
url = "2.5.4" url = "2.5"
[patch.crates-io] [patch.crates-io]
librqbit = { git = "https://github.com/ikatson/rqbit.git", branch = "tracker-udp-dualstack", package = "librqbit" } librqbit = { version = "9.0.0-beta.0", git = "https://github.com/ikatson/rqbit.git", package = "librqbit", features = ["disable-upload"] }
#librqbit = { path = "../../rqbit/crates/librqbit", package = "librqbit" } #librqbit = { version = "9.0.0-beta.0", path = "../../rqbit/crates/librqbit", package = "librqbit" }

View file

@ -47,7 +47,7 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\
-d, --debug <DEBUG> -d, --debug <DEBUG>
Debug level Debug level
* `e` - error * `i` - info * `t` - trace (e.g. to run with `RUST_LOG=librqbit=trace`) * `e` - error * `i` - info * `t` - trace (run with `RUST_LOG=librqbit=trace`)
[default: ei] [default: ei]
@ -60,7 +60,7 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\
* PR#233 feature * PR#233 feature
--storage <STORAGE> --storage <STORAGE>
Directory path to store reload data (e.g. `.torrent` files) Directory path to store preloaded data (e.g. `.torrent` files)
--torrent-tracker <TORRENT_TRACKER> --torrent-tracker <TORRENT_TRACKER>
Define custom tracker(s) to preload the `.torrent` files info Define custom tracker(s) to preload the `.torrent` files info
@ -71,14 +71,38 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\
--enable-dht --enable-dht
Enable DHT resolver Enable DHT resolver
--enable-upnp-port-forwarding
Enable UPnP
--enable-upload --enable-upload
Enable upload Enable upload
--preload-regex <PRELOAD_REGEX>
Preload files match regex pattern (list only without preload by default)
## Example:
Filter by image ext ``` --preload-regex '\.(png|gif|jpeg|webp)$' ```
* requires `storage` argument defined
--save-torrents
Save resolved torrent files to the `storage` location
--socks-proxy-url <SOCKS_PROXY_URL>
Use `socks5://[username:password@]host:port`
-s <SLEEP> -s <SLEEP>
Crawl loop delay in seconds Crawl loop delay in seconds
[default: 300] [default: 300]
--upload-limit <UPLOAD_LIMIT>
Limit upload speed (b/s)
--download-limit <DOWNLOAD_LIMIT>
Limit download speed (b/s)
-h, --help -h, --help
Print help (see a summary with '-h') Print help (see a summary with '-h')

View file

@ -7,7 +7,7 @@ pub struct Argument {
/// ///
/// * `e` - error /// * `e` - error
/// * `i` - info /// * `i` - info
/// * `t` - trace (e.g. to run with `RUST_LOG=librqbit=trace`) /// * `t` - trace (run with `RUST_LOG=librqbit=trace`)
#[arg(short, long, default_value_t = String::from("ei"))] #[arg(short, long, default_value_t = String::from("ei"))]
pub debug: String, pub debug: String,
@ -21,9 +21,9 @@ pub struct Argument {
#[arg(long)] #[arg(long)]
pub infohash_file: Vec<String>, pub infohash_file: Vec<String>,
/// Directory path to store reload data (e.g. `.torrent` files) /// Directory path to store preloaded data (e.g. `.torrent` files)
#[arg(long)] #[arg(long)]
pub storage: Option<String>, pub storage: String,
/// Define custom tracker(s) to preload the `.torrent` files info /// Define custom tracker(s) to preload the `.torrent` files info
#[arg(long)] #[arg(long)]
@ -45,6 +45,23 @@ pub struct Argument {
#[arg(long, default_value_t = false)] #[arg(long, default_value_t = false)]
pub enable_upload: bool, pub enable_upload: bool,
/// Preload files match regex pattern (list only without preload by default)
///
/// ## Example:
///
/// Filter by image ext
/// ```
/// --preload-regex '\.(png|gif|jpeg|webp)$'
/// ```
///
/// * requires `storage` argument defined
#[arg(long)]
pub preload_regex: Option<String>,
/// Save resolved torrent files to the `storage` location
#[arg(long, default_value_t = true)]
pub save_torrents: bool,
/// Use `socks5://[username:password@]host:port` /// Use `socks5://[username:password@]host:port`
#[arg(long)] #[arg(long)]
pub socks_proxy_url: Option<String>, pub socks_proxy_url: Option<String>,
@ -52,4 +69,12 @@ pub struct Argument {
/// Crawl loop delay in seconds /// Crawl loop delay in seconds
#[arg(short, default_value_t = 300)] #[arg(short, default_value_t = 300)]
pub sleep: u64, pub sleep: u64,
/// Limit upload speed (b/s)
#[arg(long)]
pub upload_limit: Option<u32>,
/// Limit download speed (b/s)
#[arg(long)]
pub download_limit: Option<u32>,
} }

View file

@ -1 +0,0 @@
pub mod torrent;

View file

@ -1,38 +0,0 @@
use anyhow::{Result, bail};
use std::{fs, io::Write, path::PathBuf, str::FromStr};
/// Filesystem-backed store for resolved `.torrent` files, keyed by info-hash.
pub struct Storage(PathBuf);

impl Storage {
    /// Prepare the storage directory at `storage`.
    ///
    /// * bails when the target exists but is a regular file
    /// * when `clear` is set, an existing directory is removed first
    /// * the directory (with any missing parents) is then (re)created
    pub fn init(storage: &str, clear: bool) -> Result<Self> {
        let root = PathBuf::from_str(storage)?;
        if let Ok(meta) = fs::metadata(&root) {
            if meta.is_file() {
                bail!("Target destination is not directory!")
            }
            if clear && meta.is_dir() {
                fs::remove_dir_all(&root)?;
            }
        }
        fs::create_dir_all(&root)?;
        Ok(Self(root))
    }

    /// True when a `.torrent` file for `infohash` is already on disk.
    pub fn exists(&self, infohash: &str) -> bool {
        matches!(fs::metadata(self.filename(infohash)), Ok(m) if m.is_file())
    }

    /// Write `bytes` as `<infohash>.torrent`; returns the created path.
    pub fn save(&self, infohash: &str, bytes: &[u8]) -> Result<PathBuf> {
        let target = self.filename(infohash);
        fs::File::create(&target)?.write_all(bytes)?;
        Ok(target)
    }

    /// Full path of the `.torrent` file for `infohash`.
    fn filename(&self, infohash: &str) -> PathBuf {
        self.0.join(format!("{infohash}.torrent"))
    }
}

View file

@ -1,9 +1,28 @@
pub fn error(e: &anyhow::Error) { mod level;
eprintln!("[{}] [error] {e}", now()) use level::Level;
}
pub fn info(message: String) { pub struct Debug(Vec<Level>);
println!("[{}] [info] {message}", now())
impl Debug {
pub fn init(levels: &str) -> anyhow::Result<Self> {
let mut l = Vec::with_capacity(levels.len());
for s in levels.to_lowercase().chars() {
l.push(Level::parse(s)?);
}
Ok(Self(l))
}
pub fn error(&self, message: &str) {
if self.0.contains(&Level::Error) {
eprintln!("[{}] [error] {message}", now());
}
}
pub fn info(&self, message: &str) {
if self.0.contains(&Level::Info) {
println!("[{}] [info] {message}", now());
}
}
} }
fn now() -> u128 { fn now() -> u128 {

22
src/debug/level.rs Normal file
View file

@ -0,0 +1,22 @@
use anyhow::{Result, bail};
/// One enabled debug output channel, parsed from the `--debug` flag string.
#[derive(PartialEq)]
pub enum Level {
    Error,
    Info,
    Trace,
}

impl Level {
    /// Map a single debug-flag character to its `Level`.
    ///
    /// * `e` → `Error`, `i` → `Info`, `t` → `Trace`
    /// * selecting `Trace` also installs the global `tracing` subscriber
    ///
    /// # Errors
    ///
    /// Bails on any character other than `e`, `i`, `t`.
    pub fn parse(value: char) -> Result<Self> {
        match value {
            'e' => Ok(Self::Error),
            'i' => Ok(Self::Info),
            't' => {
                // `tracing_subscriber::fmt::init` panics when a global
                // subscriber is already installed, so a repeated `t`
                // (e.g. `--debug tt`) must only initialize it once.
                static TRACE_INIT: std::sync::Once = std::sync::Once::new();
                TRACE_INIT.call_once(tracing_subscriber::fmt::init);
                Ok(Self::Trace)
            }
            _ => bail!("Unsupported debug value `{value}`!"),
        }
    }
}

View file

@ -1,132 +1,141 @@
mod api; mod api;
mod argument; mod argument;
mod database;
mod debug; mod debug;
mod peers;
mod storage;
mod trackers;
use anyhow::Result;
use debug::Debug;
use storage::Storage;
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> Result<()> {
use clap::Parser; use clap::Parser;
use librqbit::SessionOptions; use librqbit::{AddTorrent, AddTorrentOptions, AddTorrentResponse, SessionOptions};
use std::str::FromStr; use std::{num::NonZero, time::Duration};
let argument = argument::Argument::parse(); // init components
let arg = argument::Argument::parse();
// calculate debug level once let debug = Debug::init(&arg.debug)?;
let is_debug_i = argument.debug.contains("i"); let peers = peers::Peers::init(&arg.initial_peer)?;
let is_debug_e = argument.debug.contains("e"); let storage = Storage::init(&arg.storage, arg.clear)?;
let trackers = trackers::Trackers::init(&arg.torrent_tracker)?;
if argument.debug.contains("t") {
tracing_subscriber::fmt::init()
}
// init shared members
let torrent_storage = if let Some(t) = argument.storage {
let s = database::torrent::Storage::init(&t, argument.clear)?;
if argument.clear && is_debug_i {
debug::info(String::from("Cleanup torrent storage"));
}
Some(s)
} else {
None
};
let mut trackers = std::collections::HashSet::with_capacity(argument.torrent_tracker.len());
for tracker in argument.torrent_tracker {
trackers.insert(url::Url::from_str(&tracker)?);
}
let mut peers = Vec::with_capacity(argument.initial_peer.len());
for peer in argument.initial_peer {
peers.push(std::net::SocketAddr::from_str(&peer)?);
}
// begin
if is_debug_i {
debug::info(String::from("Crawler started"));
}
loop {
if is_debug_i {
debug::info(String::from("New index session begin..."));
}
let mut total = 0;
let session = librqbit::Session::new_with_opts( let session = librqbit::Session::new_with_opts(
std::path::PathBuf::new(), storage.path(),
SessionOptions { SessionOptions {
disable_dht: !argument.enable_dht, disable_upload: !arg.enable_upload,
disable_upload: !argument.enable_upload, disable_dht: !arg.enable_dht,
disable_dht_persistence: true,
persistence: None, persistence: None,
ratelimits: librqbit::limits::LimitsConfig {
upload_bps: arg.upload_limit.and_then(NonZero::new),
download_bps: arg.download_limit.and_then(NonZero::new),
},
trackers: trackers.clone(), trackers: trackers.clone(),
..SessionOptions::default() ..SessionOptions::default()
}, },
) )
.await?; .await?;
// collect info-hashes from API
for source in &argument.infohash_file {
if is_debug_i {
debug::info(format!("Handle info-hash source `{source}`..."));
}
// begin
debug.info("Crawler started");
loop {
debug.info("Index queue begin...");
let mut total = 0;
// collect info-hashes from each API channel
for source in &arg.infohash_file {
debug.info(&format!("Handle info-hash source `{source}`..."));
// aquatic server may update the stats at this moment, // aquatic server may update the stats at this moment,
// handle this state manually // handle this state manually
match api::infohashes(source) { match api::infohashes(source) {
Ok(infohashes) => { Ok(infohashes) => {
total += infohashes.len(); total += infohashes.len();
for i in infohashes { for i in infohashes {
if torrent_storage.as_ref().is_some_and(|s| !s.exists(&i)) { debug.info(&format!("Index `{i}`..."));
if is_debug_i {
debug::info(format!("Resolve `{i}`..."));
}
match session match session
.add_torrent( .add_torrent(
librqbit::AddTorrent::from_url(format!( AddTorrent::from_url(format!("magnet:?xt=urn:btih:{i}")),
"magnet:?xt=urn:btih:{i}" Some(AddTorrentOptions {
)), overwrite: true,
Some(librqbit::AddTorrentOptions {
disable_trackers: trackers.is_empty(), disable_trackers: trackers.is_empty(),
initial_peers: if peers.is_empty() { initial_peers: if peers.is_empty() {
None None
} else { } else {
Some(peers.clone()) Some(peers.clone())
}, },
list_only: true, // preload nothing, but listing when regex pattern argument is given
list_only: arg.preload_regex.is_none(),
// this option allows rqbit manager to preload some or any files match pattern
// * useful to build index with multimedia files, like images for audio albums
output_folder: storage.output_folder(&i).ok(),
// applies preload some files to the destination directory (above)
only_files_regex: arg.preload_regex.clone(),
..Default::default() ..Default::default()
}), }),
) )
.await? .await
{ {
librqbit::AddTorrentResponse::ListOnly(r) => { Ok(r) => match r {
if let Some(ref s) = torrent_storage { AddTorrentResponse::AlreadyManaged(_, t)
let p = s.save(&i, &r.torrent_bytes)?; | AddTorrentResponse::Added(_, t) => {
if is_debug_i { if arg.save_torrents {
debug::info(format!( t.with_metadata(|m| {
"Add new torrent file `{}`", save_torrent_file(
p.to_string_lossy() &storage,
)); &debug,
&i,
&m.torrent_bytes,
)
})?;
} }
/*tokio::spawn({
let t = t.clone();
let d = Duration::from_secs(5);
async move {
loop {
let s = t.stats();
if s.finished {
break;
}
debug.info(&format!("{s}..."));
tokio::time::sleep(d).await;
}
}
});*/
// @TODO t.wait_until_completed().await?;
}
AddTorrentResponse::ListOnly(r) => {
if arg.save_torrents {
save_torrent_file(&storage, &debug, &i, &r.torrent_bytes)
} }
// @TODO // @TODO
// use `r.info` for Memory, SQLite, Manticore and other alternative storage type // use `r.info` for Memory, SQLite, Manticore and other alternative storage type
} }
_ => panic!(), },
Err(e) => debug.info(&format!("Torrent handle skipped: `{e}`")),
} }
} }
} }
} Err(e) => debug.error(&e.to_string()),
Err(ref e) => {
if is_debug_e {
debug::error(e)
} }
} }
} debug.info(&format!(
}
session.stop().await;
if is_debug_i {
debug::info(format!(
"Index of {total} hashes completed, await {} seconds to continue...", "Index of {total} hashes completed, await {} seconds to continue...",
argument.sleep, arg.sleep,
)); ));
} std::thread::sleep(Duration::from_secs(arg.sleep));
std::thread::sleep(std::time::Duration::from_secs(argument.sleep)); }
}
fn save_torrent_file(s: &Storage, d: &Debug, i: &str, b: &[u8]) {
if s.torrent_exists(i) {
d.info(&format!("Torrent file `{i}` already exists, skip"))
} else {
match s.save_torrent(i, b) {
Ok(r) => d.info(&format!("Add torrent file `{}`", r.to_string_lossy())),
Err(e) => d.error(&e.to_string()),
}
} }
} }

21
src/peers.rs Normal file
View file

@ -0,0 +1,21 @@
use std::{net::SocketAddr, str::FromStr};
/// Initial peer endpoints, parsed once from the CLI argument list.
pub struct Peers(Vec<SocketAddr>);

impl Peers {
    /// Parse every `host:port` string into a `SocketAddr`.
    ///
    /// Accepts any string slice (`&Vec<String>` coerces), bailing on the
    /// first address that fails to parse.
    pub fn init(peers: &[String]) -> anyhow::Result<Self> {
        let p = peers
            .iter()
            .map(|peer| SocketAddr::from_str(peer))
            .collect::<Result<Vec<_>, _>>()?;
        Ok(Self(p))
    }

    /// True when no initial peers were given on the command line.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Owned copy of the peer list, in the shape `AddTorrentOptions` expects.
    pub fn clone(&self) -> Vec<SocketAddr> {
        self.0.clone()
    }
}

61
src/storage.rs Normal file
View file

@ -0,0 +1,61 @@
use anyhow::{Result, bail};
use std::{fs, io::Write, path::PathBuf, str::FromStr};
/// Filesystem layout for crawler output: `.torrent` files and per-hash
/// preload folders, all rooted at one storage directory.
pub struct Storage(PathBuf);

impl Storage {
    /// Prepare the storage root at `storage`.
    ///
    /// * bails when the target exists but is a regular file
    /// * when `clear` is set, wipes the directory contents (keeping the
    ///   root itself, so an already-mounted path stays valid)
    /// * then creates the directory with any missing parents
    pub fn init(storage: &str, clear: bool) -> Result<Self> {
        let p = PathBuf::from_str(storage)?;
        if let Ok(t) = fs::metadata(&p) {
            if t.is_file() {
                bail!("Storage destination is not directory!");
            }
            if t.is_dir() && clear {
                for i in fs::read_dir(&p)? {
                    let r = i?.path();
                    if r.is_dir() {
                        fs::remove_dir_all(&r)?;
                    } else {
                        fs::remove_file(&r)?;
                    }
                }
            }
        }
        fs::create_dir_all(&p)?;
        Ok(Self(p))
    }

    /// True when anything already occupies the `.torrent` slot for `infohash`.
    pub fn torrent_exists(&self, infohash: &str) -> bool {
        // `symlink_metadata` does NOT follow links, so a dangling symlink
        // still counts as "exists". With `fs::metadata` the `is_symlink()`
        // branch could never fire: metadata resolves links before reporting.
        fs::symlink_metadata(self.torrent(infohash))
            .is_ok_and(|p| p.is_file() || p.is_dir() || p.is_symlink())
    }

    /// Write `bytes` as `<infohash>.torrent`; returns the created path.
    pub fn save_torrent(&self, infohash: &str, bytes: &[u8]) -> Result<PathBuf> {
        let p = self.torrent(infohash);
        let mut f = fs::File::create(&p)?;
        f.write_all(bytes)?;
        Ok(p)
    }

    /// Per-hash preload directory (created on demand), as the string path
    /// rqbit's `output_folder` option expects.
    ///
    /// # Errors
    ///
    /// Bails when the slot is occupied by a regular file, or on I/O failure.
    pub fn output_folder(&self, infohash: &str) -> Result<String> {
        let p = self.0.join(infohash);
        if p.is_file() {
            bail!("File destination is not directory!");
        }
        fs::create_dir_all(&p)?;
        Ok(p.to_string_lossy().to_string())
    }

    /// Owned copy of the storage root path.
    pub fn path(&self) -> PathBuf {
        self.0.clone()
    }

    /// Full path of the `.torrent` file for `infohash`.
    fn torrent(&self, infohash: &str) -> PathBuf {
        self.0.join(format!("{infohash}.torrent"))
    }
}

20
src/trackers.rs Normal file
View file

@ -0,0 +1,20 @@
use std::{collections::HashSet, str::FromStr};
use url::Url;
/// Custom tracker URLs, parsed and deduplicated from the CLI argument list.
pub struct Trackers(HashSet<Url>);

impl Trackers {
    /// Parse every tracker string into a `Url`, deduplicating via the set.
    ///
    /// Accepts any string slice (`&Vec<String>` coerces), bailing on the
    /// first URL that fails to parse.
    pub fn init(trackers: &[String]) -> anyhow::Result<Self> {
        let t = trackers
            .iter()
            .map(|tracker| Url::from_str(tracker))
            .collect::<Result<HashSet<_>, _>>()?;
        Ok(Self(t))
    }

    /// True when no custom trackers were given on the command line.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Owned copy of the tracker set, in the shape `SessionOptions` expects.
    pub fn clone(&self) -> HashSet<Url> {
        self.0.clone()
    }
}