implement preload_total_size argument

This commit is contained in:
yggverse 2025-06-16 19:15:20 +03:00
parent cb377425a7
commit c206a06c25
3 changed files with 22 additions and 10 deletions

View file

@ -94,6 +94,9 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\
* requires `storage` argument defined * requires `storage` argument defined
--preload-total-size <PRELOAD_TOTAL_SIZE>
          Stop the crawler once the total size of preloaded files is reached
--preload-max-filesize <PRELOAD_MAX_FILESIZE> --preload-max-filesize <PRELOAD_MAX_FILESIZE>
Max size sum of preloaded files per torrent (match `preload_regex`) Max size sum of preloaded files per torrent (match `preload_regex`)
@ -125,11 +128,6 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\
[default: 10] [default: 10]
--download-torrent-timeout <DOWNLOAD_TORRENT_TIMEOUT>
Max time to download each torrent
[default: 10]
--sleep <SLEEP> --sleep <SLEEP>
Crawl loop delay in seconds Crawl loop delay in seconds

View file

@ -63,6 +63,10 @@ pub struct Argument {
#[arg(long)] #[arg(long)]
pub preload_regex: Option<String>, pub preload_regex: Option<String>,
    /// Stop the crawler once the total size of preloaded files is reached
#[arg(long)]
pub preload_total_size: Option<u64>,
/// Max size sum of preloaded files per torrent (match `preload_regex`) /// Max size sum of preloaded files per torrent (match `preload_regex`)
#[arg(long)] #[arg(long)]
pub preload_max_filesize: Option<u64>, pub preload_max_filesize: Option<u64>,

View file

@ -16,7 +16,11 @@ async fn main() -> Result<()> {
AddTorrent, AddTorrentOptions, AddTorrentResponse, ConnectionOptions, AddTorrent, AddTorrentOptions, AddTorrentResponse, ConnectionOptions,
PeerConnectionOptions, SessionOptions, PeerConnectionOptions, SessionOptions,
}; };
use std::{collections::HashSet, num::NonZero, time::Duration}; use std::{
collections::{HashMap, HashSet},
num::NonZero,
time::Duration,
};
use tokio::time; use tokio::time;
// init components // init components
@ -56,7 +60,7 @@ async fn main() -> Result<()> {
debug.info("Crawler started"); debug.info("Crawler started");
// collect processed info hashes to skip on the next iterations (for this session) // collect processed info hashes to skip on the next iterations (for this session)
let mut index = HashSet::with_capacity(arg.index_capacity); let mut index = HashMap::with_capacity(arg.index_capacity);
loop { loop {
debug.info("Index queue begin..."); debug.info("Index queue begin...");
for source in &arg.infohash_file { for source in &arg.infohash_file {
@ -67,7 +71,7 @@ async fn main() -> Result<()> {
Ok(infohashes) => { Ok(infohashes) => {
for i in infohashes { for i in infohashes {
// is already indexed? // is already indexed?
if index.contains(&i) { if index.contains_key(&i) {
continue; continue;
} }
debug.info(&format!("Index `{i}`...")); debug.info(&format!("Index `{i}`..."));
@ -159,7 +163,7 @@ async fn main() -> Result<()> {
// cleanup irrelevant files (see rqbit#408) // cleanup irrelevant files (see rqbit#408)
storage.cleanup(&i, Some(only_files_keep))?; storage.cleanup(&i, Some(only_files_keep))?;
// ignore on the next crawl iterations for this session // ignore on the next crawl iterations for this session
index.insert(i); index.insert(i, only_files_size);
} }
Ok(AddTorrentResponse::ListOnly(r)) => { Ok(AddTorrentResponse::ListOnly(r)) => {
if arg.save_torrents { if arg.save_torrents {
@ -170,7 +174,7 @@ async fn main() -> Result<()> {
// Manticore and other alternative storage type // Manticore and other alternative storage type
// ignore on the next crawl iterations for this session // ignore on the next crawl iterations for this session
index.insert(i); index.insert(i, 0);
} }
// unexpected as should be deleted // unexpected as should be deleted
Ok(AddTorrentResponse::AlreadyManaged(..)) => panic!(), Ok(AddTorrentResponse::AlreadyManaged(..)) => panic!(),
@ -183,6 +187,12 @@ async fn main() -> Result<()> {
Err(e) => debug.error(&format!("API issue for `{source}`: `{e}`")), Err(e) => debug.error(&format!("API issue for `{source}`: `{e}`")),
} }
} }
        if let Some(limit) = arg.preload_total_size {
            // compute the accumulated preload size once so it can be both
            // compared against the limit and reported in the panic message
            // (the original message hard-coded `0` instead of the real total)
            let total: u64 = index.values().sum();
            if total > limit {
                panic!("Preload content size {total} bytes reached (limit: {limit})!")
            }
        }
debug.info(&format!( debug.info(&format!(
"Index completed, {} total, await {} seconds to continue...", "Index completed, {} total, await {} seconds to continue...",
index.len(), index.len(),