diff --git a/README.md b/README.md index b56793d..7c7914d 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,9 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\ * requires `storage` argument defined +--preload-total-size + Stop crawler on total preload files size reached + --preload-max-filesize Max size sum of preloaded files per torrent (match `preload_regex`) @@ -125,11 +128,6 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\ [default: 10] ---download-torrent-timeout - Max time to download each torrent - - [default: 10] - --sleep Crawl loop delay in seconds diff --git a/src/argument.rs b/src/argument.rs index ebc4f23..ec81af4 100644 --- a/src/argument.rs +++ b/src/argument.rs @@ -63,6 +63,10 @@ pub struct Argument { #[arg(long)] pub preload_regex: Option<String>, + /// Stop crawler on total preload files size reached + #[arg(long)] + pub preload_total_size: Option<u64>, + /// Max size sum of preloaded files per torrent (match `preload_regex`) #[arg(long)] pub preload_max_filesize: Option<u64>, diff --git a/src/main.rs b/src/main.rs index 65a4265..85bef64 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,7 +16,11 @@ async fn main() -> Result<()> { AddTorrent, AddTorrentOptions, AddTorrentResponse, ConnectionOptions, PeerConnectionOptions, SessionOptions, }; - use std::{collections::HashSet, num::NonZero, time::Duration}; + use std::{ + collections::{HashMap, HashSet}, + num::NonZero, + time::Duration, + }; use tokio::time; // init components @@ -56,7 +60,7 @@ async fn main() -> Result<()> { debug.info("Crawler started"); // collect processed info hashes to skip on the next iterations (for this session) - let mut index = HashSet::with_capacity(arg.index_capacity); + let mut index = HashMap::with_capacity(arg.index_capacity); loop { debug.info("Index queue begin..."); for source in &arg.infohash_file { @@ -67,7 +71,7 @@ async fn main() -> Result<()> { Ok(infohashes) => { for i in infohashes { // is already indexed? 
- if index.contains(&i) { + if index.contains_key(&i) { continue; } debug.info(&format!("Index `{i}`...")); @@ -159,7 +163,7 @@ async fn main() -> Result<()> { // cleanup irrelevant files (see rqbit#408) storage.cleanup(&i, Some(only_files_keep))?; // ignore on the next crawl iterations for this session - index.insert(i); + index.insert(i, only_files_size); } Ok(AddTorrentResponse::ListOnly(r)) => { if arg.save_torrents { @@ -170,7 +174,7 @@ async fn main() -> Result<()> { // Manticore and other alternative storage type // ignore on the next crawl iterations for this session - index.insert(i); + index.insert(i, 0); } // unexpected as should be deleted Ok(AddTorrentResponse::AlreadyManaged(..)) => panic!(), @@ -183,6 +187,12 @@ async fn main() -> Result<()> { Err(e) => debug.error(&format!("API issue for `{source}`: `{e}`")), } } + if arg + .preload_total_size + .is_some_and(|s| index.values().sum::<u64>() > s) + { + panic!("Preload content size {} bytes reached!", index.values().sum::<u64>()) + } debug.info(&format!( "Index completed, {} total, await {} seconds to continue...", index.len(),