diff --git a/Cargo.toml b/Cargo.toml index 1d7e1d4..c019c17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aquatic-crawler" -version = "0.1.0" +version = "0.2.0" edition = "2024" license = "MIT" readme = "README.md" @@ -18,3 +18,4 @@ librqbit = "8.1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" tokio = { version = "1.45", features = ["full"] } +url = "2.5.4" diff --git a/README.md b/README.md index 7cc097d..742a9e0 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,6 @@ Crawler/aggregation tool for the [Aquatic](https://github.com/greatest-ape/aquat * [ ] V2 * [ ] [Manticore](https://github.com/manticoresoftware/manticoresearch-rust) full text search * [ ] SQLite -* Tools - * [ ] Storage cleaner - * [ ] Implement tests ## Install @@ -53,12 +50,23 @@ Options: [default: ei] - -i, --infohash-source - Filepath(s) to the Aquatic tracker info-hash JSON/API (PR#233) + -c, --clear + Clear previous index collected on crawl session start - -t, --torrents-path + -i, --infohash-source + Filepath(s) to the Aquatic tracker info-hash JSON/API + + * PR#233 feature + + --torrents-path Directory path to store the `.torrent` files + --torrent-tracker + Define custom tracker(s) to preload the `.torrent` files info + + --disable-dht + Disable DHT resolver (useful with `--torrent-tracker`) + -s, --sleep Crawl loop delay in seconds diff --git a/src/argument.rs b/src/argument.rs index 4f2344b..723fab3 100644 --- a/src/argument.rs +++ b/src/argument.rs @@ -10,16 +10,28 @@ pub struct Argument { #[arg(short, long, default_value_t = String::from("ei"))] pub debug: String, + /// Clear previous index collected on crawl session start + #[arg(short, long, default_value_t = false)] + pub clear: bool, + /// Filepath(s) to the Aquatic tracker info-hash JSON/API /// - /// * PR #233 info-hash table implementation has multiple source tables for IPv4 and IPv6 + /// * PR#233 feature #[arg(short, long)] pub infohash_source: Vec, /// Directory path to 
store the `.torrent` files - #[arg(short, long)] + #[arg(long)] pub torrents_path: Option, + /// Define custom tracker(s) to preload the `.torrent` files info + #[arg(long)] + pub torrent_tracker: Vec, + + /// Disable DHT resolver (useful with `--torrent-tracker`) + #[arg(long, default_value_t = false)] + pub disable_dht: bool, + /// Crawl loop delay in seconds #[arg(short, long, default_value_t = 300)] pub sleep: u64, diff --git a/src/database/torrent.rs b/src/database/torrent.rs index 0be20c7..984ee56 100644 --- a/src/database/torrent.rs +++ b/src/database/torrent.rs @@ -4,12 +4,17 @@ use std::{fs, io::Write, path::PathBuf, str::FromStr}; pub struct Storage(PathBuf); impl Storage { - pub fn init(storage: &str) -> Result { + pub fn init(storage: &str, clear: bool) -> Result { let p = PathBuf::from_str(storage)?; - if fs::metadata(&p).is_ok_and(|t| t.is_file()) { - bail!("Target destination is not directory!") + if let Ok(t) = fs::metadata(&p) { + if t.is_file() { + bail!("Target destination is not directory!") + } + if t.is_dir() && clear { + fs::remove_dir_all(&p)?; + } } - fs::create_dir_all(storage)?; + fs::create_dir_all(&p)?; Ok(Self(p)) } diff --git a/src/main.rs b/src/main.rs index 00bdf11..7cede70 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,9 @@ mod debug; #[tokio::main] async fn main() -> anyhow::Result<()> { use clap::Parser; + use librqbit::SessionOptions; + use std::str::FromStr; + let argument = argument::Argument::parse(); // calculate debug level once @@ -14,11 +17,21 @@ async fn main() -> anyhow::Result<()> { // init shared members let torrent_storage = if let Some(t) = argument.torrents_path { - Some(database::torrent::Storage::init(&t)?) 
+ let s = database::torrent::Storage::init(&t, argument.clear)?; + if argument.clear && is_debug_i { + debug::info(String::from("Cleanup torrent storage")); + } + Some(s) } else { None }; + let mut trackers = std::collections::HashSet::with_capacity(argument.torrent_tracker.len()); + for tracker in argument.torrent_tracker { + trackers.insert(url::Url::from_str(&tracker)?); + } + + // begin if is_debug_i { debug::info(String::from("Crawler started")); } @@ -28,7 +41,15 @@ async fn main() -> anyhow::Result<()> { debug::info(String::from("New index session begin...")); } let mut total = 0; - let session = librqbit::Session::new(std::path::PathBuf::new()).await?; + let session = librqbit::Session::new_with_opts( + std::path::PathBuf::new(), + SessionOptions { + disable_dht: argument.disable_dht, + trackers: trackers.clone(), + ..SessionOptions::default() + }, + ) + .await?; // collect info-hashes from API for source in &argument.infohash_source { if is_debug_i {