mirror of
https://github.com/YGGverse/aquatic-crawler.git
synced 2026-03-31 09:05:33 +00:00
220 lines
11 KiB
Rust
mod api;
|
|
mod argument;
|
|
mod debug;
|
|
mod peers;
|
|
mod storage;
|
|
mod trackers;
|
|
|
|
use anyhow::Result;
|
|
use debug::Debug;
|
|
use storage::Storage;
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<()> {
|
|
use clap::Parser;
|
|
use librqbit::{
|
|
AddTorrent, AddTorrentOptions, AddTorrentResponse, ConnectionOptions,
|
|
PeerConnectionOptions, SessionOptions,
|
|
};
|
|
use std::{collections::HashSet, num::NonZero, time::Duration};
|
|
use tokio::time;
|
|
|
|
// init components
|
|
let arg = argument::Argument::parse();
|
|
let debug = Debug::init(&arg.debug)?;
|
|
let peers = peers::Peers::init(&arg.initial_peer)?;
|
|
let storage = Storage::init(&arg.storage, arg.clear)?;
|
|
let trackers = trackers::Trackers::init(&arg.torrent_tracker)?;
|
|
let preload_regex = arg.preload_regex.map(|ref r| regex::Regex::new(r).unwrap());
|
|
let session = librqbit::Session::new_with_opts(
|
|
storage.path(),
|
|
SessionOptions {
|
|
connect: Some(ConnectionOptions {
|
|
enable_tcp: arg.enable_tcp,
|
|
proxy_url: arg.proxy_url,
|
|
peer_opts: Some(PeerConnectionOptions {
|
|
connect_timeout: arg.peer_connect_timeout.map(Duration::from_secs),
|
|
read_write_timeout: arg.peer_read_write_timeout.map(Duration::from_secs),
|
|
keep_alive_interval: arg.peer_keep_alive_interval.map(Duration::from_secs),
|
|
}),
|
|
}),
|
|
disable_upload: !arg.enable_upload,
|
|
disable_dht: !arg.enable_dht,
|
|
disable_dht_persistence: true,
|
|
persistence: None,
|
|
ratelimits: librqbit::limits::LimitsConfig {
|
|
upload_bps: arg.upload_limit.and_then(NonZero::new),
|
|
download_bps: arg.download_limit.and_then(NonZero::new),
|
|
},
|
|
trackers: trackers.clone(),
|
|
..SessionOptions::default()
|
|
},
|
|
)
|
|
.await?;
|
|
|
|
// begin
|
|
debug.info("Crawler started");
|
|
|
|
// collect processed info hashes to skip on the next iterations (for this session)
|
|
let mut index = HashSet::with_capacity(arg.index_capacity);
|
|
loop {
|
|
debug.info("Index queue begin...");
|
|
for source in &arg.infohash_file {
|
|
debug.info(&format!("Index source `{source}`..."));
|
|
// grab latest info-hashes from this source
|
|
// * aquatic server may update the stats at this moment, handle result manually
|
|
match api::infohashes(source) {
|
|
Ok(infohashes) => {
|
|
for i in infohashes {
|
|
// is already indexed?
|
|
if index.contains(&i) {
|
|
continue;
|
|
}
|
|
debug.info(&format!("Index `{i}`..."));
|
|
// run the crawler in single thread for performance reasons,
|
|
// use `timeout` argument option to skip the dead connections.
|
|
match time::timeout(
|
|
Duration::from_secs(arg.add_torrent_timeout),
|
|
session.add_torrent(
|
|
AddTorrent::from_url(format!("magnet:?xt=urn:btih:{i}")),
|
|
Some(AddTorrentOptions {
|
|
paused: true, // continue after `only_files` init
|
|
overwrite: true,
|
|
disable_trackers: trackers.is_empty(),
|
|
initial_peers: peers.initial_peers(),
|
|
list_only: preload_regex.is_none(),
|
|
// it is important to blacklist all files preload until initiation
|
|
only_files: Some(Vec::with_capacity(
|
|
arg.preload_max_filecount.unwrap_or_default(),
|
|
)),
|
|
// the destination folder to preload files match `only_files_regex`
|
|
// * e.g. images for audio albums
|
|
output_folder: storage.output_folder(&i, true).ok(),
|
|
..Default::default()
|
|
}),
|
|
),
|
|
)
|
|
.await
|
|
{
|
|
Ok(r) => match r {
|
|
// on `preload_regex` case only
|
|
Ok(AddTorrentResponse::Added(id, mt)) => {
|
|
let mut only_files_size = 0;
|
|
let mut only_files_keep = Vec::with_capacity(
|
|
arg.preload_max_filecount.unwrap_or_default(),
|
|
);
|
|
let mut only_files = HashSet::with_capacity(
|
|
arg.preload_max_filecount.unwrap_or_default(),
|
|
);
|
|
mt.wait_until_initialized().await?;
|
|
mt.with_metadata(|m| {
|
|
// init preload files list
|
|
if let Some(ref regex) = preload_regex {
|
|
for (id, info) in m.file_infos.iter().enumerate() {
|
|
if regex.is_match(
|
|
info.relative_filename.to_str().unwrap(),
|
|
) {
|
|
if arg.preload_max_filesize.is_some_and(
|
|
|limit| only_files_size + info.len > limit,
|
|
) {
|
|
debug.info(&format!(
|
|
"Total files size limit `{i}` reached!"
|
|
));
|
|
break;
|
|
}
|
|
if arg.preload_max_filecount.is_some_and(
|
|
|limit| only_files.len() + 1 > limit,
|
|
) {
|
|
debug.info(&format!(
|
|
"Total files count limit for `{i}` reached!"
|
|
));
|
|
break;
|
|
}
|
|
only_files_size += info.len;
|
|
only_files_keep.push(storage.absolute(&i, &info.relative_filename));
|
|
only_files.insert(id);
|
|
}
|
|
}
|
|
}
|
|
// dump info-hash to the torrent file
|
|
if arg.save_torrents {
|
|
save_torrent_file(
|
|
&storage,
|
|
&debug,
|
|
&i,
|
|
&m.torrent_bytes,
|
|
)
|
|
}
|
|
// @TODO
|
|
// use `r.info` for Memory, SQLite, Manticore and other alternative storage type
|
|
})?;
|
|
session.update_only_files(&mt, &only_files).await?;
|
|
session.unpause(&mt).await?;
|
|
// await for `preload_regex` files download to continue
|
|
match time::timeout(
|
|
Duration::from_secs(arg.download_torrent_timeout),
|
|
mt.wait_until_completed(),
|
|
)
|
|
.await
|
|
{
|
|
Ok(r) => {
|
|
if let Err(e) = r {
|
|
debug.info(&format!("Skip `{i}`: `{e}`."))
|
|
} else {
|
|
// remove torrent from session as indexed
|
|
session
|
|
.delete(
|
|
librqbit::api::TorrentIdOrHash::Id(id),
|
|
false,
|
|
)
|
|
.await?;
|
|
// cleanup irrelevant files (see rqbit#408)
|
|
storage.cleanup(&i, Some(only_files_keep))?;
|
|
// ignore on the next crawl iterations for this session
|
|
index.insert(i);
|
|
}
|
|
}
|
|
Err(e) => debug.info(&format!("Skip `{i}`: `{e}`.")),
|
|
}
|
|
}
|
|
Ok(AddTorrentResponse::ListOnly(r)) => {
|
|
if arg.save_torrents {
|
|
save_torrent_file(&storage, &debug, &i, &r.torrent_bytes)
|
|
}
|
|
// @TODO
|
|
// use `r.info` for Memory, SQLite,
|
|
// Manticore and other alternative storage type
|
|
|
|
// ignore on the next crawl iterations for this session
|
|
index.insert(i);
|
|
}
|
|
// unexpected as should be deleted
|
|
Ok(AddTorrentResponse::AlreadyManaged(..)) => panic!(),
|
|
Err(e) => debug.info(&format!("Skip `{i}`: `{e}`.")),
|
|
},
|
|
Err(e) => debug.info(&format!("Skip `{i}`: `{e}`.")),
|
|
}
|
|
}
|
|
}
|
|
Err(e) => debug.error(&format!("API issue for `{source}`: `{e}`")),
|
|
}
|
|
}
|
|
debug.info(&format!(
|
|
"Index completed, {} total, await {} seconds to continue...",
|
|
index.len(),
|
|
arg.sleep,
|
|
));
|
|
std::thread::sleep(Duration::from_secs(arg.sleep));
|
|
}
|
|
}
|
|
|
|
/// Persist the raw torrent bytes `b` for info-hash `i` into storage `s`,
/// reporting the outcome through `d`. Existing torrent files are left
/// untouched (logged and skipped); save failures are logged as errors.
fn save_torrent_file(s: &Storage, d: &Debug, i: &str, b: &[u8]) {
    // guard: never overwrite a previously dumped torrent file
    if s.torrent_exists(i) {
        d.info(&format!("Torrent file `{i}` already exists, skip"));
        return;
    }
    match s.save_torrent(i, b) {
        Ok(path) => d.info(&format!("Add torrent file `{}`", path.to_string_lossy())),
        Err(e) => d.error(&e.to_string()),
    }
}
|