Implement the `clear`, `torrent_tracker`, and `disable_dht` arguments

This commit is contained in:
yggverse 2025-06-07 15:53:17 +03:00
parent bd6cc95bf3
commit 1da27bfffe
5 changed files with 62 additions and 15 deletions

View file

@ -1,6 +1,6 @@
[package] [package]
name = "aquatic-crawler" name = "aquatic-crawler"
version = "0.1.0" version = "0.2.0"
edition = "2024" edition = "2024"
license = "MIT" license = "MIT"
readme = "README.md" readme = "README.md"
@ -18,3 +18,4 @@ librqbit = "8.1.0"
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0" serde_json = "1.0"
tokio = { version = "1.45", features = ["full"] } tokio = { version = "1.45", features = ["full"] }
url = "2.5.4"

View file

@ -21,9 +21,6 @@ Crawler/aggregation tool for the [Aquatic](https://github.com/greatest-ape/aquat
* [ ] V2 * [ ] V2
* [ ] [Manticore](https://github.com/manticoresoftware/manticoresearch-rust) full text search * [ ] [Manticore](https://github.com/manticoresoftware/manticoresearch-rust) full text search
* [ ] SQLite * [ ] SQLite
* Tools
* [ ] Storage cleaner
* [ ] Implement tests
## Install ## Install
@ -53,12 +50,23 @@ Options:
[default: ei] [default: ei]
-i, --infohash-source <INFOHASH_SOURCE> -c, --clear
Filepath(s) to the Aquatic tracker info-hash JSON/API (PR#233) Clear previous index collected on crawl session start
-t, --torrents-path <TORRENTS_PATH> -i, --infohash-source <INFOHASH_SOURCE>
Filepath(s) to the Aquatic tracker info-hash JSON/API
* PR#233 feature
--torrents-path <TORRENTS_PATH>
Directory path to store the `.torrent` files Directory path to store the `.torrent` files
--torrent-tracker <TORRENT_TRACKER>
Define custom tracker(s) to preload the `.torrent` files info
--disable-dht
Disable DHT resolver (useful with `torrent_tracker`)
-s, --sleep <SLEEP> -s, --sleep <SLEEP>
Crawl loop delay in seconds Crawl loop delay in seconds

View file

@ -10,16 +10,28 @@ pub struct Argument {
#[arg(short, long, default_value_t = String::from("ei"))] #[arg(short, long, default_value_t = String::from("ei"))]
pub debug: String, pub debug: String,
/// Clear previous index collected on crawl session start
#[arg(short, long, default_value_t = false)]
pub clear: bool,
/// Filepath(s) to the Aquatic tracker info-hash JSON/API /// Filepath(s) to the Aquatic tracker info-hash JSON/API
/// ///
/// * PR #233 info-hash table implementation has multiple source tables for IPv4 and IPv6 /// * PR#233 feature
#[arg(short, long)] #[arg(short, long)]
pub infohash_source: Vec<String>, pub infohash_source: Vec<String>,
/// Directory path to store the `.torrent` files /// Directory path to store the `.torrent` files
#[arg(short, long)] #[arg(long)]
pub torrents_path: Option<String>, pub torrents_path: Option<String>,
/// Define custom tracker(s) to preload the `.torrent` files info
#[arg(long)]
pub torrent_tracker: Vec<String>,
/// Disable DHT resolver (useful with `torrent_tracker`)
#[arg(long, default_value_t = false)]
pub disable_dht: bool,
/// Crawl loop delay in seconds /// Crawl loop delay in seconds
#[arg(short, long, default_value_t = 300)] #[arg(short, long, default_value_t = 300)]
pub sleep: u64, pub sleep: u64,

View file

@ -4,12 +4,17 @@ use std::{fs, io::Write, path::PathBuf, str::FromStr};
pub struct Storage(PathBuf); pub struct Storage(PathBuf);
impl Storage { impl Storage {
pub fn init(storage: &str) -> Result<Self> { pub fn init(storage: &str, clear: bool) -> Result<Self> {
let p = PathBuf::from_str(storage)?; let p = PathBuf::from_str(storage)?;
if fs::metadata(&p).is_ok_and(|t| t.is_file()) { if let Ok(t) = fs::metadata(&p) {
if t.is_file() {
bail!("Target destination is not directory!") bail!("Target destination is not directory!")
} }
fs::create_dir_all(storage)?; if t.is_dir() && clear {
fs::remove_dir_all(&p)?;
}
}
fs::create_dir_all(&p)?;
Ok(Self(p)) Ok(Self(p))
} }

View file

@ -6,6 +6,9 @@ mod debug;
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
use clap::Parser; use clap::Parser;
use librqbit::SessionOptions;
use std::str::FromStr;
let argument = argument::Argument::parse(); let argument = argument::Argument::parse();
// calculate debug level once // calculate debug level once
@ -14,11 +17,21 @@ async fn main() -> anyhow::Result<()> {
// init shared members // init shared members
let torrent_storage = if let Some(t) = argument.torrents_path { let torrent_storage = if let Some(t) = argument.torrents_path {
Some(database::torrent::Storage::init(&t)?) let s = database::torrent::Storage::init(&t, argument.clear)?;
if argument.clear && is_debug_i {
debug::info(String::from("Cleanup torrent storage"));
}
Some(s)
} else { } else {
None None
}; };
let mut trackers = std::collections::HashSet::with_capacity(argument.torrent_tracker.len());
for tracker in argument.torrent_tracker {
trackers.insert(url::Url::from_str(&tracker)?);
}
// begin
if is_debug_i { if is_debug_i {
debug::info(String::from("Crawler started")); debug::info(String::from("Crawler started"));
} }
@ -28,7 +41,15 @@ async fn main() -> anyhow::Result<()> {
debug::info(String::from("New index session begin...")); debug::info(String::from("New index session begin..."));
} }
let mut total = 0; let mut total = 0;
let session = librqbit::Session::new(std::path::PathBuf::new()).await?; let session = librqbit::Session::new_with_opts(
std::path::PathBuf::new(),
SessionOptions {
disable_dht: argument.disable_dht,
trackers: trackers.clone(),
..SessionOptions::default()
},
)
.await?;
// collect info-hashes from API // collect info-hashes from API
for source in &argument.infohash_source { for source in &argument.infohash_source {
if is_debug_i { if is_debug_i {