diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..ada8a24 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +custom: https://yggverse.github.io/#donate \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..85edd8b --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,27 @@ +name: Build + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: -Dwarnings + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Run rustfmt + run: cargo fmt --all -- --check + - name: Run clippy + run: cargo clippy --all-targets + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..869df07 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ef7fa8d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "aquatic-crawler" +version = "0.1.0" +edition = "2024" +license = "MIT" +readme = "README.md" +description = "Crawler tool for the Aquatic BitTorrent tracker API" +keywords = ["aquatic", "crawler", "parser", "bittorrent", "magnet"] +categories = ["network-programming"] +repository = "https://github.com/YGGverse/aquatic-crawler" +# homepage = "https://yggverse.github.io" + +[dependencies] +anyhow = "1.0.98" +clap = { version = "4.5", features = ["derive"] } +hyper-util = "0.1.14" +librqbit = "8.1.0" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +tokio = { version = "1", features = ["full"] } diff --git a/README.md b/README.md index 0746931..edc52fd 100644 --- a/README.md +++ b/README.md @@ -1 +1,72 @@ -# aquatic-crawler \ No newline at end of file +# aquatic-crawler + 
+![Build](https://github.com/YGGverse/aquatic-crawler/actions/workflows/build.yml/badge.svg) +[![Dependencies](https://deps.rs/repo/github/YGGverse/aquatic-crawler/status.svg)](https://deps.rs/repo/github/YGGverse/aquatic-crawler) +[![crates.io](https://img.shields.io/crates/v/aquatic-crawler.svg)](https://crates.io/crates/aquatic-crawler) + +Crawler/aggregation tool for the [Aquatic](https://github.com/greatest-ape/aquatic) BitTorrent tracker. + +> [!NOTE] +> Project in development! + +## Roadmap + +* Targets supported + * [x] IPv4/IPv6 info-hash JSON/API (see [PR#233](https://github.com/greatest-ape/aquatic/pull/233)) + * [x] local file path + * [ ] remote URL +* Storage + * [x] File system (dump as `.torrent`) + [x] V1 + [ ] V2 + * [ ] [Manticore](https://github.com/manticoresoftware/manticoresearch-rust) full text search + * [ ] SQLite +* Tools + * [ ] Storage cleaner + * [ ] Implement tests + +## Install + +1. `git clone https://github.com/YGGverse/aquatic-crawler.git && cd aquatic-crawler` +2. `cargo build --release` +3. `sudo install target/release/aquatic-crawler /usr/local/bin/aquatic-crawler` + +## Usage + +``` bash +aquatic-crawler --infohash-source /path/to/info-hash-ipv4.json\ + --infohash-source /path/to/info-hash-ipv6.json\ + --infohash-source /path/to/another-source.json\ + --torrents-path /path/to/storage +``` +* all arguments are optional, to support multiple source and target drivers + running without arguments does nothing! 
+
+### Options
+
+``` bash
+Options:
+  -d, --debug <DEBUG>
+          Debug level
+
+          * `e` - error * `i` - info
+
+          [default: ei]
+
+  -i, --infohash-source <INFOHASH_SOURCE>
+          Filepath(s) to the Aquatic tracker info-hash JSON/API (PR#233)
+
+  -t, --torrents-path <TORRENTS_PATH>
+          Directory path to store the `.torrent` files
+
+  -s, --sleep <SLEEP>
+          Crawl loop delay in seconds
+
+          [default: 300]
+
+  -h, --help
+          Print help (see a summary with '-h')
+
+  -V, --version
+          Print version
+```
\ No newline at end of file
diff --git a/src/api.rs b/src/api.rs
new file mode 100644
index 0000000..b9be999
--- /dev/null
+++ b/src/api.rs
@@ -0,0 +1,13 @@
+/// Read the info-hash list from the JSON file at `path`.
+///
+/// The file must contain a JSON array of strings; the entries are returned
+/// as-is, without any validation of the hash format.
+///
+/// # Errors
+///
+/// Fails when the file cannot be read or when its content does not decode
+/// into an array of strings.
+pub fn infohashes(path: &str) -> anyhow::Result<Vec<String>> {
+    let json = std::fs::read_to_string(path)?;
+    Ok(serde_json::from_str(&json)?)
+}
diff --git a/src/argument.rs b/src/argument.rs
new file mode 100644
index 0000000..4f2344b
--- /dev/null
+++ b/src/argument.rs
@@ -0,0 +1,28 @@
+use clap::Parser;
+
+// NOTE: the `///` comments below are rendered by clap as the `--help` text,
+// keep them user-facing.
+#[derive(Parser, Debug)]
+#[command(version, about, long_about = None)]
+pub struct Argument {
+    /// Debug level
+    ///
+    /// * `e` - error
+    /// * `i` - info
+    #[arg(short, long, default_value_t = String::from("ei"))]
+    pub debug: String,
+
+    /// Filepath(s) to the Aquatic tracker info-hash JSON/API
+    ///
+    /// * PR #233 info-hash table implementation has multiple source tables for IPv4 and IPv6
+    #[arg(short, long)]
+    pub infohash_source: Vec<String>,
+
+    /// Directory path to store the `.torrent` files
+    #[arg(short, long)]
+    pub torrents_path: Option<String>,
+
+    /// Crawl loop delay in seconds
+    #[arg(short, long, default_value_t = 300)]
+    pub sleep: u64,
+}
diff --git a/src/database.rs b/src/database.rs
new file mode 100644
index 0000000..89c2730
--- /dev/null
+++ b/src/database.rs
@@ -0,0 +1 @@
+pub mod torrent;
diff --git a/src/database/torrent.rs b/src/database/torrent.rs
new file mode 100644
index 0000000..0be20c7
---
/dev/null
+++ b/src/database/torrent.rs
@@ -0,0 +1,59 @@
+use anyhow::{Result, bail};
+use std::{fs, path::PathBuf};
+
+/// File-system storage keeping one `{infohash}.torrent` file per torrent
+/// inside a single directory.
+pub struct Storage(PathBuf);
+
+impl Storage {
+    /// Open the storage rooted at `storage`, creating the directory (and its
+    /// missing parents) when it does not exist yet.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `storage` is an existing regular file or when the directory
+    /// cannot be created.
+    pub fn init(storage: &str) -> Result<Self> {
+        let root = PathBuf::from(storage);
+        if root.is_file() {
+            bail!("Target destination is not directory!")
+        }
+        fs::create_dir_all(&root)?;
+        Ok(Self(root))
+    }
+
+    /// Tell whether the `.torrent` file for `infohash` is already stored.
+    ///
+    /// Always `false` for an info-hash that is not a safe file name.
+    pub fn exists(&self, infohash: &str) -> bool {
+        self.filename(infohash).is_some_and(|p| p.is_file())
+    }
+
+    /// Write `bytes` to the `.torrent` file of `infohash`, replacing any
+    /// previous content, and return the path of the written file.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `infohash` is not a safe file name or when the
+    /// file cannot be written.
+    pub fn save(&self, infohash: &str, bytes: &[u8]) -> Result<PathBuf> {
+        let Some(p) = self.filename(infohash) else {
+            bail!("Invalid info-hash `{infohash}`")
+        };
+        fs::write(&p, bytes)?;
+        Ok(p)
+    }
+
+    /// Build the path of the `.torrent` file for `infohash`.
+    ///
+    /// The info-hash comes from an external source and becomes a file name,
+    /// so anything but a non-empty ASCII alphanumeric string (hex or base32
+    /// digest) yields `None`; this keeps path separators and `..` out of the
+    /// storage path.
+    fn filename(&self, infohash: &str) -> Option<PathBuf> {
+        if infohash.is_empty() || !infohash.bytes().all(|b| b.is_ascii_alphanumeric()) {
+            return None;
+        }
+        Some(self.0.join(format!("{infohash}.torrent")))
+    }
+}
diff --git a/src/debug.rs b/src/debug.rs
new file mode 100644
index 0000000..8274dbf
--- /dev/null
+++ b/src/debug.rs
@@ -0,0 +1,20 @@
+/// Milliseconds elapsed since the Unix epoch, used to stamp the log lines.
+///
+/// Yields `0` instead of panicking when the system clock is set before the
+/// epoch.
+fn timestamp() -> u128 {
+    std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_default()
+        .as_millis()
+}
+
+/// Print `e` to stderr as a timestamped `[error]` line.
+pub fn error(e: &anyhow::Error) {
+    eprintln!("[{}] [error] {e}", timestamp())
+}
+
+/// Print `message` to stderr as a timestamped `[info]` line.
+pub fn info(message: String) {
+    eprintln!("[{}] [info] {message}", timestamp())
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..00bdf11
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,114 @@
+mod api;
+mod argument;
+mod database;
+mod debug;
+
+/// Crawler entry point.
+///
+/// Runs an endless loop: every iteration reads the configured info-hash
+/// sources, resolves the torrents missing from the storage through a
+/// list-only `librqbit` session, saves them as `.torrent` files and then
+/// waits `--sleep` seconds.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    use clap::Parser;
+    let argument = argument::Argument::parse();
+
+    // calculate debug level once
+    let is_debug_i = argument.debug.contains('i');
+    let is_debug_e = argument.debug.contains('e');
+
+    // init shared members
+    let torrent_storage = argument
+        .torrents_path
+        .as_deref()
+        .map(database::torrent::Storage::init)
+        .transpose()?;
+
+    if is_debug_i {
+        debug::info(String::from("Crawler started"));
+    }
+
+    loop {
+        if is_debug_i {
+            debug::info(String::from("New index session begin..."));
+        }
+        let mut total = 0;
+        let session = librqbit::Session::new(std::path::PathBuf::new()).await?;
+        // collect info-hashes from API
+        for source in &argument.infohash_source {
+            if is_debug_i {
+                debug::info(format!("Handle info-hash source `{source}`..."));
+            }
+
+            // aquatic server may update the stats at this moment,
+            // report the failure and retry this source on the next iteration
+            let infohashes = match api::infohashes(source) {
+                Ok(infohashes) => infohashes,
+                Err(ref e) => {
+                    if is_debug_e {
+                        debug::error(e)
+                    }
+                    continue;
+                }
+            };
+            total += infohashes.len();
+
+            // the file system is the only storage driver so far,
+            // nothing to resolve without it
+            let Some(storage) = torrent_storage.as_ref() else {
+                continue;
+            };
+
+            for i in infohashes.iter().filter(|i| !storage.exists(i)) {
+                // a single unresolvable torrent must not stop the crawler,
+                // collect its outcome to report it below
+                let saved: anyhow::Result<std::path::PathBuf> = async {
+                    match session
+                        .add_torrent(
+                            librqbit::AddTorrent::from_url(format!("magnet:?xt=urn:btih:{i}")),
+                            Some(librqbit::AddTorrentOptions {
+                                list_only: true,
+                                ..Default::default()
+                            }),
+                        )
+                        .await?
+                    {
+                        // @TODO
+                        // use `r.info` for Memory, SQLite, Manticore and other alternative storage type
+                        librqbit::AddTorrentResponse::ListOnly(r) => {
+                            storage.save(i, &r.torrent_bytes)
+                        }
+                        _ => anyhow::bail!("Unexpected add torrent response for `{i}`"),
+                    }
+                }
+                .await;
+
+                match saved {
+                    Ok(p) => {
+                        if is_debug_i {
+                            debug::info(format!(
+                                "Add new torrent file `{}`",
+                                p.to_string_lossy()
+                            ));
+                        }
+                    }
+                    Err(ref e) => {
+                        if is_debug_e {
+                            debug::error(e)
+                        }
+                    }
+                }
+            }
+        }
+        session.stop().await;
+        if is_debug_i {
+            debug::info(format!(
+                "Index of {total} hashes completed, await {} seconds to continue...",
+                argument.sleep,
+            ));
+        }
+        // async sleep: `std::thread::sleep` would block the runtime worker
+        tokio::time::sleep(std::time::Duration::from_secs(argument.sleep)).await;
+    }
+}