initial commit

This commit is contained in:
yggverse 2024-06-24 23:00:45 +03:00
parent 13673da85c
commit 0b636de67c
5 changed files with 832 additions and 0 deletions

392
src/Controller/Cli.php Executable file
View file

@ -0,0 +1,392 @@
<?php
declare(strict_types=1);
namespace Yggverse\GeminiDL\Controller;
use \Yggverse\GeminiDL\Model\Cli\Message;
use \Yggverse\GeminiDL\Model\Cli\Option;
use \Yggverse\GeminiDL\Model\Filesystem;
use \Yggverse\Gemini\Client\Request;
use \Yggverse\Gemini\Client\Response;
use \Yggverse\Gemtext\Document;
use \Yggverse\Net\Address;
// CLI controller: downloads the --source document and, with --crawl enabled,
// every same-host document reachable from it through gemtext links
class Cli
{
    // Init totals
    public int $skip = 0;   // links rejected by the source rules
    public int $fail = 0;   // documents not received with code 20 or not saved
    public int $save = 0;   // documents written to the filesystem
    public int $size = 0;   // total length of the raw responses, bytes
    public float $time = 0; // total time spent waiting for responses, seconds

    // Pool for crawler queue
    public array $source = [];

    // Define model helpers
    public \Yggverse\GeminiDL\Model\Filesystem $filesystem;
    public \Yggverse\GeminiDL\Model\Cli\Option $option;

    // Init CLI object using options preset
    //
    // $config - options array as returned by getopt
    //
    // Exceptions thrown by the Option and Filesystem constructors are not
    // caught here
    public function __construct(
        array $config // getopt
    ) {
        // Init options
        $this->option = new Option(
            $config
        );

        // Init local filesystem location
        $this->filesystem = new Filesystem(
            $this->option->target,
            $this->option->unique ? time() : null // snap version
        );

        // Append source address to crawler queue
        $this->addSource(
            $this->option->source
        );
    }

    // Appends valid address to crawler queue
    //
    // Returns true when the address was added, false when it does not pass
    // the source rules or is already queued
    public function addSource(
        string $url
    ): bool
    {
        // Validate given value and check it is unique in pool
        if (!$this->_source($url) || in_array($url, $this->source, true))
        {
            return false;
        }

        $this->source[] = $url;

        return true;
    }

    // Begin crawler task
    //
    // $offset - queue position to start from
    //
    // Without --crawl only the document at $offset is processed; with --crawl
    // the queue is processed until it is exhausted, then the summary is
    // printed. The queue is walked in a loop (not by recursion), so the call
    // depth does not depend on the number of documents.
    public function start(
        int $offset = 0
    ): void
    {
        while (true)
        {
            // Check for crawl queue completed
            if (!isset($this->source[$offset]))
            {
                print(
                    $this->_summary()
                );

                return; // stop
            }

            // Apply delay to prevent source overload
            if ($offset)
            {
                sleep(
                    $this->option->delay
                );
            }

            $this->_download(
                $this->source[$offset]
            );

            // Single document requested
            if (!$this->option->crawl)
            {
                return;
            }

            // Crawl next address...
            $offset++;
        }
    }

    // Prints the error message followed by the optional help text
    public static function exception(
        string $message,
        ?string $help = null
    ): void
    {
        print(
            Message::red(
                $message
            )
        );

        if ($help)
        {
            print(
                Message::plain(
                    $help
                )
            );
        }
    }

    // Local helpers

    // Requests single address, prints its stats, collects the links (crawl
    // mode only) and saves the document to the local filesystem
    private function _download(
        string $url
    ): void
    {
        // Dump source address
        print(
            Message::blue(
                $url,
                true
            )
        );

        // Parse source address
        $source = new Address(
            $url
        );

        // Build request
        $request = new Request(
            $source->get()
        );

        // Track request time
        $time = microtime(true);

        // Parse response
        $response = new Response(
            $bin = $request->getResponse()
        );

        // Calculate response time
        $this->time += $time = microtime(true) - $time; // @TODO to API

        // Check response code success
        $success = 20 === $response->getCode();

        $code = sprintf(
            _("\tcode: %d"),
            intval(
                $response->getCode()
            )
        );

        print(
            $success ? Message::magenta($code)
                     : Message::red($code)
        );

        // Calculate document size; the cast keeps strlen safe under strict
        // types in case the client returned a non-string for a failed request
        $this->size += $size = strlen((string) $bin);

        print(
            Message::magenta(
                sprintf(
                    _("\tsize: %d"),
                    $size
                )
            )
        );

        // Get meta headers info
        if ($response->getMeta())
        {
            print(
                Message::magenta(
                    sprintf(
                        _("\tmeta: %s"),
                        $response->getMeta()
                    )
                )
            );
        }

        print(
            Message::magenta(
                sprintf(
                    _("\ttime: %f -d %f"),
                    $time,
                    $this->option->delay + $time
                )
            )
        );

        // Nothing to parse or save without success code: count the document
        // as failed and let the caller continue with the next address
        if (!$success)
        {
            $this->fail++;

            return;
        }

        // Set downloader mode
        $raw = ($this->option->raw || !str_contains((string) $response->getMeta(), 'text/gemini'));

        // Stays null unless the gemtext gets parsed below
        $document = null;

        // Parse gemtext
        if (!$raw && $this->option->crawl)
        {
            // Parse gemtext content
            $document = new Document(
                $response->getBody()
            );

            // Get link entities
            foreach ($document->getLinks() as $link)
            {
                // Build new address
                $address = new Address(
                    $link->getAddress()
                );

                // Make relative links absolute
                $address->toAbsolute(
                    $source
                );

                // Check link match common source rules, @TODO --external links
                if (!$this->_source($address->get()) || $address->getHost() != $source->getHost())
                {
                    // Exactly one skipped link per rejected address
                    $this->skip++;

                    continue;
                }

                // Address --keep not requested
                if (!$this->option->keep)
                {
                    // Replace link to local path
                    $link->setAddress(
                        $this->filesystem->getFilenameFromNetAddress(
                            $address,
                            $this->option->index,
                        )
                    );
                }

                // Append new address to crawler pool
                $this->addSource(
                    $address->get()
                );
            }
        }

        // Build document filesystem location
        $filename = $this->filesystem->getFilenameFromNetAddress(
            $source,
            $this->option->index
        );

        // Save document to file: original body unless gemtext was parsed
        $result = $this->filesystem->save(
            $filename,
            null === $document ? (string) $response->getBody()
                               : $document->toString()
        );

        // Debug FS
        if ($result)
        {
            print(
                Message::green(
                    _("\tsave: ") . $filename
                )
            );

            $this->save++;
        }
        else
        {
            print(
                Message::red(
                    _("\tfail: ") . $filename
                )
            );

            $this->fail++;
        }
    }

    // Checks the address is supported by crawler and allowed by options
    private function _source(
        string $value
    ): bool
    {
        // Supported Gemini protocol links only
        if (!str_starts_with($value, 'gemini://'))
        {
            return false;
        }

        // Make sure link --match option
        if (!preg_match($this->option->match, $value))
        {
            return false;
        }

        return true;
    }

    // Builds the totals report printed when the crawl queue is completed
    private function _summary(): string
    {
        return implode(
            PHP_EOL,
            [
                PHP_EOL,
                Message::blue(
                    _('----------------')
                ),
                Message::blue(
                    _('crawl completed!'),
                    true
                ),
                Message::magenta(
                    sprintf(
                        _("\tdocs: %d"),
                        count(
                            $this->source
                        )
                    )
                ),
                Message::magenta(
                    sprintf(
                        _("\tsave: %d"),
                        $this->save
                    )
                ),
                Message::magenta(
                    sprintf(
                        _("\tskip: %d"),
                        $this->skip
                    )
                ),
                Message::magenta(
                    sprintf(
                        _("\tfail: %d"),
                        $this->fail
                    )
                ),
                Message::magenta(
                    sprintf(
                        _("\tsize: %d"),
                        $this->size
                    )
                ),
                Message::magenta(
                    sprintf(
                        _("\ttime: %f -d %f"),
                        $this->time,
                        $this->option->delay * count(
                            $this->source
                        ) + $this->time
                    )
                ),
                PHP_EOL
            ]
        );
    }
}

60
src/Model/Cli/Message.php Executable file
View file

@ -0,0 +1,60 @@
<?php
declare(strict_types=1);
namespace Yggverse\GeminiDL\Model\Cli;
use \Codedungeon\PHPCliColors\Color;
// Formats CLI output lines: every helper returns the message prefixed with
// color reset + style sequence and terminated with PHP_EOL
class Message
{
    // Error output
    public static function red(
        string $message
    ): string
    {
        return self::plain(
            $message,
            Color::RED
        );
    }

    // Details output
    public static function magenta(
        string $message
    ): string
    {
        return self::plain(
            $message,
            Color::MAGENTA
        );
    }

    // Header output; $bold switches to the light blue variant
    public static function blue(
        string $message,
        bool $bold = false
    ): string
    {
        return self::plain(
            $message,
            $bold ? Color::LIGHT_BLUE
                  : Color::BLUE
        );
    }

    // Success output
    public static function green(
        string $message
    ): string
    {
        return self::plain(
            $message,
            Color::GREEN
        );
    }

    // Base formatter
    //
    // $style - color sequence to apply, null to print with reset style only;
    //          declared explicitly nullable as implicit nullability through
    //          the null default is deprecated since PHP 8.4
    public static function plain(
        string $message,
        ?string $style = null
    ): string
    {
        return Color::RESET . $style . $message . PHP_EOL;
    }
}

152
src/Model/Cli/Option.php Executable file
View file

@ -0,0 +1,152 @@
<?php
declare(strict_types=1);
namespace Yggverse\GeminiDL\Model\Cli;
// Parses and validates the getopt result; both long and short option names
// are supported, long name has priority
class Option
{
    public bool $crawl = false;
    public int $delay = 1;
    public bool $external = false;
    public int $follow = 0;
    public bool $help = false;
    public string $index = 'index.gmi';
    public bool $keep = false;
    public int $level = 0;
    public string $match = '/.*/';
    public bool $raw = false;
    public string $source;
    public string $target;
    public bool $unique = false;

    // $options - options array as returned by getopt
    //
    // Throws \Exception with the reason when options are empty, --help is
    // requested (empty message) or some value does not pass the validation
    public function __construct(
        array $options
    )
    {
        if (empty($options))
        {
            throw new \Exception(
                _('Options required, run --help')
            );
        }

        // Define variables; flags are enabled by presence
        // (getopt stores false as the value of an option given without one)
        $this->crawl = isset($options['crawl']) || isset($options['c']) || $this->crawl;

        $this->delay = intval(
            $options['delay'] ?? $options['d'] ?? $this->delay
        );

        $this->external = isset($options['external']) || isset($options['e']) || $this->external;

        $this->follow = intval(
            $options['follow'] ?? $options['f'] ?? $this->follow
        );

        $this->help = isset($options['help']) || isset($options['h']) || $this->help;

        $this->index = strval(
            $options['index'] ?? $options['i'] ?? $this->index
        );

        $this->keep = isset($options['keep']) || isset($options['k']) || $this->keep;

        $this->level = intval(
            $options['level'] ?? $options['l'] ?? $this->level
        );

        $this->match = strval(
            $options['match'] ?? $options['m'] ?? $this->match
        );

        $this->raw = isset($options['raw']) || isset($options['r']) || $this->raw;

        $this->source = strval(
            $options['source'] ?? $options['s'] ?? null
        );

        $this->target = strval(
            $options['target'] ?? $options['t'] ?? null
        );

        $this->unique = isset($options['unique']) || isset($options['u']) || $this->unique;

        // Throw help @TODO
        if ($this->help)
        {
            throw new \Exception;
        }

        // Validate source
        if (empty($this->source))
        {
            throw new \Exception(
                _('--source argument required!')
            );
        }

        if (!str_starts_with($this->source, 'gemini://'))
        {
            throw new \Exception(
                _('--source protocol not supported!')
            );
        }

        // Validate match: the pattern is applied with warnings muted, so the
        // broken expression is reported by own message instead of PHP warning
        // followed by the misleading "does not match" error
        set_error_handler(
            static fn (): bool => true
        );

        try
        {
            $matched = preg_match(
                $this->match,
                $this->source
            );
        }

        finally
        {
            restore_error_handler();
        }

        if (false === $matched)
        {
            throw new \Exception(
                _('--match is not a valid regular expression!')
            );
        }

        if (!$matched)
        {
            throw new \Exception(
                _('--source does not --match condition!')
            );
        }

        // Validate target
        if (empty($this->target))
        {
            throw new \Exception(
                _('--target argument required!')
            );
        }

        if (!is_dir($this->target))
        {
            throw new \Exception(
                _('--target location not exists!')
            );
        }

        if (!is_readable($this->target))
        {
            throw new \Exception(
                _('--target location not readable!')
            );
        }

        if (!is_writable($this->target))
        {
            throw new \Exception(
                _('--target location not writable!')
            );
        }

        // Validate index
        if (!pathinfo($this->index, PATHINFO_EXTENSION))
        {
            throw new \Exception(
                _('--index filename must have extension!')
            );
        }

        // Validate delay: sleep() throws ValueError on negative seconds
        if ($this->delay < 0)
        {
            throw new \Exception(
                _('--delay must not be negative!')
            );
        }
    }
}

177
src/Model/Filesystem.php Executable file
View file

@ -0,0 +1,177 @@
<?php
declare(strict_types=1);
namespace Yggverse\GeminiDL\Model;
// Local storage helper: converts network addresses to filenames located
// inside of the storage directory and writes documents there
class Filesystem
{
    // Absolute storage location, always ends with DIRECTORY_SEPARATOR
    private string $_filepath;

    // $directory - existing, readable and writable storage root
    // $version   - optional subdirectory name for the snap version
    //
    // Throws \Exception when the directory does not pass the validation
    public function __construct(
        string $directory,
        ?int $version
    ) {
        if (empty($directory))
        {
            throw new \Exception(
                _('Directory required')
            );
        }

        if (!is_dir($directory))
        {
            throw new \Exception(
                _('Directory does not exist')
            );
        }

        if (!is_readable($directory))
        {
            throw new \Exception(
                _('Directory not readable')
            );
        }

        if (!is_writable($directory))
        {
            throw new \Exception(
                _('Directory not writable')
            );
        }

        // Trim first to not get double separator for the filesystem root
        $this->_filepath = rtrim(
            (string) realpath(
                $directory
            ),
            DIRECTORY_SEPARATOR
        ) . DIRECTORY_SEPARATOR;

        if ($version)
        {
            $this->_filepath .= $version . DIRECTORY_SEPARATOR;
        }
    }

    public function getFilepath(): string
    {
        return $this->_filepath;
    }

    // Builds local filename for the network address
    //
    // $index - filename to append when the address looks like a directory
    //          (trailing separator, no extension, or bare host name)
    //
    // Throws \Exception when address or its scheme is empty, or when the
    // resulting filename is an existing directory
    public function getFilenameFromNetAddress(
        \Yggverse\Net\Address $address,
        ?string $index = null
    ): ?string
    {
        if (empty($address->get()))
        {
            throw new \Exception(
                _('Incorrect target address')
            );
        }

        if (empty($address->getScheme()))
        {
            throw new \Exception(
                _('Scheme required for target address')
            );
        }

        // Strip scheme, convert URL separators to the local ones
        $filename = $this->_filepath . str_replace(
            [
                $address->getScheme() . '://',
                '/'
            ],
            [
                '',
                DIRECTORY_SEPARATOR
            ],
            $address->get()
        );

        // URL separators are already converted at this point, so the trailing
        // one must be compared with the local separator
        if ($index && (str_ends_with($filename, DIRECTORY_SEPARATOR) || !pathinfo($filename, PATHINFO_EXTENSION) || basename($filename) === $address->getHost()))
        {
            $filename = rtrim(
                $filename,
                DIRECTORY_SEPARATOR
            ) . DIRECTORY_SEPARATOR . $index;
        }

        if (is_dir($filename))
        {
            throw new \Exception(
                _('Target filename linked to directory')
            );
        }

        return $filename;
    }

    // Writes data to the file, creates missing directories
    //
    // Returns true on success (zero-length documents included), false when
    // the file could not be written
    //
    // Throws \Exception when the filename is located out of the storage or
    // its directory is not available
    public function save(
        string $filename,
        string $data
    ): bool
    {
        if (!str_starts_with($filename, $this->_filepath))
        {
            throw new \Exception(
                _('Target filename out of storage location')
            );
        }

        // Filenames are built from remote links: parent directory segments
        // could pass the prefix check above and still leave the storage
        $segments = preg_split(
            '~[\\\\/]~',
            substr(
                $filename,
                strlen(
                    $this->_filepath
                )
            )
        );

        if (in_array('..', (array) $segments, true))
        {
            throw new \Exception(
                _('Target filename out of storage location')
            );
        }

        // Not str_replace by basename: that also removes earlier occurrences
        // of the same name from the parent path
        $filepath = dirname(
            $filename
        );

        // Warning suppressed as the result is validated right after
        @mkdir(
            $filepath,
            0777, // @TODO be careful with leading zero
            true
        );

        if (!is_dir($filepath))
        {
            throw new \Exception(
                _('Could not create target directory')
            );
        }

        if (!is_readable($filepath))
        {
            throw new \Exception(
                _('Target directory is not readable')
            );
        }

        if (!is_writable($filepath))
        {
            throw new \Exception(
                _('Target directory is not writable')
            );
        }

        // file_put_contents returns bytes written, zero is valid for empty data
        return false !== file_put_contents(
            $filename,
            $data
        );
    }
}

51
src/gemini-dl.php Executable file
View file

@ -0,0 +1,51 @@
#!/usr/bin/env php
<?php
// Load dependencies
require_once __DIR__ .
             DIRECTORY_SEPARATOR . '..' .
             DIRECTORY_SEPARATOR . 'vendor' .
             DIRECTORY_SEPARATOR . 'autoload.php';

use \Yggverse\GeminiDL\Controller\Cli;

try
{
    // Start application
    $cli = new Cli(
        getopt(
            'cd:ef:hi:kl:m:rs:t:u',
            [
                'crawl', // flag without value, same as its short form -c
                'delay:',
                'external',
                'follow:',
                'help',
                'index:',
                'keep',
                'level:',
                'match:',
                'raw',
                'source:',
                'target:',
                'unique'
            ]
        )
    );

    $cli->start();
}

// Something went wrong
catch (\Exception $data)
{
    $help = __DIR__ .
            DIRECTORY_SEPARATOR . '..' .
            DIRECTORY_SEPARATOR . 'help.gmi';

    // Print the error even if the help file is missing in this installation
    Cli::exception(
        $data->getMessage(),
        is_readable($help) ? (string) file_get_contents($help)
                           : null
    );
}