initial commit

This commit is contained in:
yggverse 2025-06-07 12:28:02 +03:00
parent 9fcd892d42
commit d55c642eb6
11 changed files with 305 additions and 1 deletions

1
.github/FUNDING.yml vendored Normal file
View file

@ -0,0 +1 @@
custom: https://yggverse.github.io/#donate

27
.github/workflows/build.yml vendored Normal file
View file

@ -0,0 +1,27 @@
name: Build
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: -Dwarnings
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run rustfmt
run: cargo fmt --all -- --check
- name: Run clippy
run: cargo clippy --all-targets
- name: Build
run: cargo build --verbose
- name: Run tests
run: cargo test --verbose

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/target
Cargo.lock

20
Cargo.toml Normal file
View file

@ -0,0 +1,20 @@
[package]
name = "aquatic-crawler"
version = "0.1.0"
edition = "2024"
license = "MIT"
readme = "README.md"
description = "Crawler tool for the Aquatic BitTorrent tracker API"
keywords = ["aquatic", "crawler", "parser", "bittorrent", "magnet"]
categories = ["network-programming"]
repository = "https://github.com/YGGverse/aquatic-crawler"
# homepage = "https://yggverse.github.io"
[dependencies]
anyhow = "1.0.98"
clap = { version = "4.5", features = ["derive"] }
hyper-util = "0.1.14"
librqbit = "8.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tokio = { version = "1", features = ["full"] }

View file

@ -1 +1,72 @@
# aquatic-crawler
![Build](https://github.com/YGGverse/aquatic-crawler/actions/workflows/build.yml/badge.svg)
[![Dependencies](https://deps.rs/repo/github/YGGverse/aquatic-crawler/status.svg)](https://deps.rs/repo/github/YGGverse/aquatic-crawler)
[![crates.io](https://img.shields.io/crates/v/aquatic-crawler.svg)](https://crates.io/crates/aquatic-crawler)
Crawler/aggregation tool for the [Aquatic](https://github.com/greatest-ape/aquatic) BitTorrent tracker.
> [!NOTE]
> Project in development!
## Roadmap
* Targets supported
* [x] IPv4/IPv6 info-hash JSON/API (see [PR#233](https://github.com/greatest-ape/aquatic/pull/233))
* [x] local file path
* [ ] remote URL
* Storage
* [x] File system (dump as `.torrent`)
* [x] V1
* [ ] V2
* [ ] [Manticore](https://github.com/manticoresoftware/manticoresearch-rust) full text search
* [ ] SQLite
* Tools
* [ ] Storage cleaner
* [ ] Implement tests
## Install
1. `git clone https://github.com/YGGverse/aquatic-crawler.git && cd aquatic-crawler`
2. `cargo build --release`
3. `sudo install target/release/aquatic-crawler /usr/local/bin/aquatic-crawler`
## Usage
``` bash
aquatic-crawler --infohash-source /path/to/info-hash-ipv4.json\
--infohash-source /path/to/info-hash-ipv6.json\
--infohash-source /path/to/another-source.json\
--torrents-path /path/to/storage
```
* all arguments are optional to support multiple source and target drivers;
running without any arguments does nothing!
### Options
``` bash
Options:
-d, --debug <DEBUG>
Debug level
* `e` - error * `i` - info
[default: ei]
-i, --infohash-source <INFOHASH_SOURCE>
Filepath(s) to the Aquatic tracker info-hash JSON/API (PR#233)
-t, --torrents-path <TORRENTS_PATH>
Directory path to store the `.torrent` files
-s, --sleep <SLEEP>
Crawl loop delay in seconds
[default: 300]
-h, --help
Print help (see a summary with '-h')
-V, --version
Print version
```

13
src/api.rs Normal file
View file

@ -0,0 +1,13 @@
/// Read the file at `path` and decode it as a JSON array of
/// info-hash strings (the Aquatic tracker info-hash API format,
/// see PR#233).
///
/// # Errors
///
/// Fails when the file cannot be opened/read, or when its content
/// is not a JSON array of strings.
pub fn infohashes(path: &str) -> anyhow::Result<Vec<String>> {
    // `fs::read_to_string` sizes the buffer from file metadata and
    // replaces the manual File::open + Read::read_to_string dance
    let s = std::fs::read_to_string(path)?;
    Ok(serde_json::from_str(&s)?)
}

26
src/argument.rs Normal file
View file

@ -0,0 +1,26 @@
use clap::Parser;
// Command-line interface definition, parsed by clap's derive macro.
//
// NOTE(review): the `///` doc comments below double as the rendered
// `--help` text — treat them as user-facing output, not internal docs.
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
pub struct Argument {
    /// Debug level
    ///
    /// * `e` - error
    /// * `i` - info
    // Any combination of level letters, e.g. "ei", "e", "" (silent)
    #[arg(short, long, default_value_t = String::from("ei"))]
    pub debug: String,
    /// Filepath(s) to the Aquatic tracker info-hash JSON/API
    ///
    /// * PR #233 info-hash table implementation has multiple source tables for IPv4 and IPv6
    // Repeatable flag: `-i a.json -i b.json` collects into the Vec
    #[arg(short, long)]
    pub infohash_source: Vec<String>,
    /// Directory path to store the `.torrent` files
    // Optional: when absent, no torrent files are written to disk
    #[arg(short, long)]
    pub torrents_path: Option<String>,
    /// Crawl loop delay in seconds
    #[arg(short, long, default_value_t = 300)]
    pub sleep: u64,
}

1
src/database.rs Normal file
View file

@ -0,0 +1 @@
pub mod torrent;

33
src/database/torrent.rs Normal file
View file

@ -0,0 +1,33 @@
use anyhow::{Result, bail};
use std::{fs, path::PathBuf};

/// File-system storage that dumps torrents as `<root>/<infohash>.torrent`.
pub struct Storage(PathBuf);

impl Storage {
    /// Open (and create, if missing) the storage directory at `storage`.
    ///
    /// # Errors
    ///
    /// Fails when the path exists but is a regular file, or when the
    /// directory cannot be created.
    pub fn init(storage: &str) -> Result<Self> {
        // `PathBuf::from` is infallible — no need for the FromStr detour
        let root = PathBuf::from(storage);
        if root.is_file() {
            bail!("Target destination is not directory!")
        }
        fs::create_dir_all(&root)?;
        Ok(Self(root))
    }

    /// Check whether a `.torrent` file for `infohash` already exists.
    pub fn exists(&self, infohash: &str) -> bool {
        self.filename(infohash).is_file()
    }

    /// Write `bytes` to `<root>/<infohash>.torrent`, returning the path
    /// of the created file.
    ///
    /// # Errors
    ///
    /// Fails when the file cannot be created or written.
    pub fn save(&self, infohash: &str, bytes: &[u8]) -> Result<PathBuf> {
        let path = self.filename(infohash);
        // `fs::write` = create + write_all in one call
        fs::write(&path, bytes)?;
        Ok(path)
    }

    /// Build the target filepath for `infohash`.
    fn filename(&self, infohash: &str) -> PathBuf {
        self.0.join(format!("{infohash}.torrent"))
    }
}

19
src/debug.rs Normal file
View file

@ -0,0 +1,19 @@
/// Milliseconds since the Unix epoch, used as the log timestamp.
fn timestamp() -> u128 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock is set before the Unix epoch")
        .as_millis()
}

/// Print an error-level message to stderr with a millisecond timestamp.
pub fn error(e: &anyhow::Error) {
    eprintln!("[{}] [error] {e}", timestamp())
}

/// Print an info-level message to stderr with a millisecond timestamp.
pub fn info(message: String) {
    eprintln!("[{}] [info] {message}", timestamp())
}

91
src/main.rs Normal file
View file

@ -0,0 +1,91 @@
mod api;
mod argument;
mod database;
mod debug;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    use clap::Parser;
    let argument = argument::Argument::parse();

    // resolve debug flags once instead of scanning the string per message
    let is_debug_i = argument.debug.contains('i');
    let is_debug_e = argument.debug.contains('e');

    // init shared members: optional `.torrent` dump storage
    // (enabled by `--torrents-path`)
    let torrent_storage = argument
        .torrents_path
        .as_deref()
        .map(database::torrent::Storage::init)
        .transpose()?;

    if is_debug_i {
        debug::info(String::from("Crawler started"));
    }
    loop {
        if is_debug_i {
            debug::info(String::from("New index session begin..."));
        }
        let mut total = 0;
        // fresh BitTorrent session per crawl iteration
        let session = librqbit::Session::new(std::path::PathBuf::new()).await?;
        // collect info-hashes from API
        for source in &argument.infohash_source {
            if is_debug_i {
                debug::info(format!("Handle info-hash source `{source}`..."));
            }
            // aquatic server may update the stats at this moment,
            // handle this state manually
            match api::infohashes(source) {
                Ok(infohashes) => {
                    total += infohashes.len();
                    for i in infohashes {
                        // skip hashes already dumped to disk;
                        // with no storage configured there is nothing to do
                        if torrent_storage.as_ref().is_some_and(|s| !s.exists(&i)) {
                            // resolve metadata only (`list_only`) — no download
                            match session
                                .add_torrent(
                                    librqbit::AddTorrent::from_url(format!(
                                        "magnet:?xt=urn:btih:{i}"
                                    )),
                                    Some(librqbit::AddTorrentOptions {
                                        list_only: true,
                                        ..Default::default()
                                    }),
                                )
                                .await?
                            {
                                librqbit::AddTorrentResponse::ListOnly(r) => {
                                    if let Some(ref s) = torrent_storage {
                                        let p = s.save(&i, &r.torrent_bytes)?;
                                        if is_debug_i {
                                            debug::info(format!(
                                                "Add new torrent file `{}`",
                                                p.to_string_lossy()
                                            ));
                                        }
                                    }
                                    // @TODO
                                    // use `r.info` for Memory, SQLite, Manticore and other alternative storage type
                                }
                                // `list_only: true` must yield a ListOnly response
                                _ => panic!("unexpected non-ListOnly response to a list_only request"),
                            }
                        }
                    }
                }
                Err(ref e) => {
                    if is_debug_e {
                        debug::error(e)
                    }
                }
            }
        }
        session.stop().await;
        if is_debug_i {
            debug::info(format!(
                "Index of {total} hashes completed, await {} seconds to continue...",
                argument.sleep,
            ));
        }
        // async sleep: `std::thread::sleep` would block the tokio
        // runtime worker thread for the whole delay
        tokio::time::sleep(std::time::Duration::from_secs(argument.sleep)).await;
    }
}