mirror of
https://github.com/YGGverse/aquatic-crawler.git
synced 2026-03-31 17:15:35 +00:00
implement preload_total_size argument
This commit is contained in:
parent
cb377425a7
commit
c206a06c25
3 changed files with 22 additions and 10 deletions
|
|
@ -94,6 +94,9 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\
|
||||||
|
|
||||||
* requires `storage` argument defined
|
* requires `storage` argument defined
|
||||||
|
|
||||||
|
--preload-total-size <PRELOAD_TOTAL_SIZE>
|
||||||
|
Stop crawler on total preload files size reached
|
||||||
|
|
||||||
--preload-max-filesize <PRELOAD_MAX_FILESIZE>
|
--preload-max-filesize <PRELOAD_MAX_FILESIZE>
|
||||||
Max size sum of preloaded files per torrent (match `preload_regex`)
|
Max size sum of preloaded files per torrent (match `preload_regex`)
|
||||||
|
|
||||||
|
|
@ -125,11 +128,6 @@ aquatic-crawler --infohash-file /path/to/info-hash-ipv4.json\
|
||||||
|
|
||||||
[default: 10]
|
[default: 10]
|
||||||
|
|
||||||
--download-torrent-timeout <DOWNLOAD_TORRENT_TIMEOUT>
|
|
||||||
Max time to download each torrent
|
|
||||||
|
|
||||||
[default: 10]
|
|
||||||
|
|
||||||
--sleep <SLEEP>
|
--sleep <SLEEP>
|
||||||
Crawl loop delay in seconds
|
Crawl loop delay in seconds
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -63,6 +63,10 @@ pub struct Argument {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub preload_regex: Option<String>,
|
pub preload_regex: Option<String>,
|
||||||
|
|
||||||
|
/// Stop crawler on total preload files size reached
|
||||||
|
#[arg(long)]
|
||||||
|
pub preload_total_size: Option<u64>,
|
||||||
|
|
||||||
/// Max size sum of preloaded files per torrent (match `preload_regex`)
|
/// Max size sum of preloaded files per torrent (match `preload_regex`)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub preload_max_filesize: Option<u64>,
|
pub preload_max_filesize: Option<u64>,
|
||||||
|
|
|
||||||
20
src/main.rs
20
src/main.rs
|
|
@ -16,7 +16,11 @@ async fn main() -> Result<()> {
|
||||||
AddTorrent, AddTorrentOptions, AddTorrentResponse, ConnectionOptions,
|
AddTorrent, AddTorrentOptions, AddTorrentResponse, ConnectionOptions,
|
||||||
PeerConnectionOptions, SessionOptions,
|
PeerConnectionOptions, SessionOptions,
|
||||||
};
|
};
|
||||||
use std::{collections::HashSet, num::NonZero, time::Duration};
|
use std::{
|
||||||
|
collections::{HashMap, HashSet},
|
||||||
|
num::NonZero,
|
||||||
|
time::Duration,
|
||||||
|
};
|
||||||
use tokio::time;
|
use tokio::time;
|
||||||
|
|
||||||
// init components
|
// init components
|
||||||
|
|
@ -56,7 +60,7 @@ async fn main() -> Result<()> {
|
||||||
debug.info("Crawler started");
|
debug.info("Crawler started");
|
||||||
|
|
||||||
// collect processed info hashes to skip on the next iterations (for this session)
|
// collect processed info hashes to skip on the next iterations (for this session)
|
||||||
let mut index = HashSet::with_capacity(arg.index_capacity);
|
let mut index = HashMap::with_capacity(arg.index_capacity);
|
||||||
loop {
|
loop {
|
||||||
debug.info("Index queue begin...");
|
debug.info("Index queue begin...");
|
||||||
for source in &arg.infohash_file {
|
for source in &arg.infohash_file {
|
||||||
|
|
@ -67,7 +71,7 @@ async fn main() -> Result<()> {
|
||||||
Ok(infohashes) => {
|
Ok(infohashes) => {
|
||||||
for i in infohashes {
|
for i in infohashes {
|
||||||
// is already indexed?
|
// is already indexed?
|
||||||
if index.contains(&i) {
|
if index.contains_key(&i) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
debug.info(&format!("Index `{i}`..."));
|
debug.info(&format!("Index `{i}`..."));
|
||||||
|
|
@ -159,7 +163,7 @@ async fn main() -> Result<()> {
|
||||||
// cleanup irrelevant files (see rqbit#408)
|
// cleanup irrelevant files (see rqbit#408)
|
||||||
storage.cleanup(&i, Some(only_files_keep))?;
|
storage.cleanup(&i, Some(only_files_keep))?;
|
||||||
// ignore on the next crawl iterations for this session
|
// ignore on the next crawl iterations for this session
|
||||||
index.insert(i);
|
index.insert(i, only_files_size);
|
||||||
}
|
}
|
||||||
Ok(AddTorrentResponse::ListOnly(r)) => {
|
Ok(AddTorrentResponse::ListOnly(r)) => {
|
||||||
if arg.save_torrents {
|
if arg.save_torrents {
|
||||||
|
|
@ -170,7 +174,7 @@ async fn main() -> Result<()> {
|
||||||
// Manticore and other alternative storage type
|
// Manticore and other alternative storage type
|
||||||
|
|
||||||
// ignore on the next crawl iterations for this session
|
// ignore on the next crawl iterations for this session
|
||||||
index.insert(i);
|
index.insert(i, 0);
|
||||||
}
|
}
|
||||||
// unexpected as should be deleted
|
// unexpected as should be deleted
|
||||||
Ok(AddTorrentResponse::AlreadyManaged(..)) => panic!(),
|
Ok(AddTorrentResponse::AlreadyManaged(..)) => panic!(),
|
||||||
|
|
@ -183,6 +187,12 @@ async fn main() -> Result<()> {
|
||||||
Err(e) => debug.error(&format!("API issue for `{source}`: `{e}`")),
|
Err(e) => debug.error(&format!("API issue for `{source}`: `{e}`")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if arg
|
||||||
|
.preload_total_size
|
||||||
|
.is_some_and(|s| index.values().sum::<u64>() > s)
|
||||||
|
{
|
||||||
|
panic!("Preload content size {} bytes reached!", 0)
|
||||||
|
}
|
||||||
debug.info(&format!(
|
debug.info(&format!(
|
||||||
"Index completed, {} total, await {} seconds to continue...",
|
"Index completed, {} total, await {} seconds to continue...",
|
||||||
index.len(),
|
index.len(),
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue