diff --git a/src/Controller/Cli.php b/src/Controller/Cli.php
new file mode 100755
index 0000000..b97f3a1
--- /dev/null
+++ b/src/Controller/Cli.php
@@ -0,0 +1,392 @@
+option = new Option(
+            $config
+        );
+
+        // Init local filesystem location
+        $this->filesystem = new Filesystem(
+            $this->option->target,
+            $this->option->unique ? time() : null // snap version
+        );
+
+        // Append source address to crawler queue
+        $this->addSource(
+            $this->option->source
+        );
+    }
+
+    // Appends valid address to crawler queue
+    //
+    // Returns true when the address passed the _source() rules and was not
+    // queued before, false otherwise (the queue is left untouched)
+    public function addSource(
+        string $url
+    ): bool
+    {
+        // Validate given value and check it is unique in pool
+        // (strict comparison: the pool contains strings only)
+        if ($this->_source($url) && !in_array($url, $this->source, true))
+        {
+            $this->source[] = $url;
+
+            return true;
+        }
+
+        return false;
+    }
+
+    // Begin crawler task
+    //
+    // Processes the queue entry at $offset: requests it, prints the response
+    // details, optionally collects its links into the queue, saves the
+    // document and - in --crawl mode - continues with the next queue entry.
+    // Prints the summary once $offset runs past the end of the queue
+    public function start(
+        int $offset = 0
+    ): void
+    {
+        // Check for crawl queue completed
+        // (done before the delay: no reason to sleep when nothing is left)
+        if (!isset($this->source[$offset]))
+        {
+            print(
+                $this->_summary()
+            );
+
+            return; // stop
+        }
+
+        // Apply delay to prevent source overload
+        if ($offset)
+        {
+            sleep(
+                $this->option->delay
+            );
+        }
+
+        // Dump source address
+        print(
+            Message::blue(
+                $this->source[$offset],
+                true
+            )
+        );
+
+        // Parse source address
+        $source = new Address(
+            $this->source[$offset]
+        );
+
+        // Build request
+        $request = new Request(
+            $source->get()
+        );
+
+        // Track request time
+        $time = microtime(true);
+
+        // Parse response
+        $response = new Response(
+            $bin = $request->getResponse()
+        );
+
+        // Calculate response time
+        $this->time += $time = microtime(true) - $time; // @TODO to API
+
+        // Check response code success
+        if (20 === $response->getCode())
+        {
+            print(
+                Message::magenta(
+                    sprintf(
+                        _("\tcode: %d"),
+                        $response->getCode()
+                    )
+                )
+            );
+        }
+
+        else
+        {
+            print(
+                Message::red(
+                    sprintf(
+                        _("\tcode: %d"),
+                        intval(
+                            $response->getCode()
+                        )
+                    )
+                )
+            );
+
+            // Crawl next address...
+            if ($this->option->crawl)
+            {
+                $this->start(
+                    $offset + 1
+                );
+
+                // The rest of the queue has been processed by the call above:
+                // stop here, otherwise the failed response is saved and the
+                // remaining queue is crawled a second time
+                return;
+            }
+        }
+
+        // Calculate document size
+        // (cast protects strlen() from an empty / failed response)
+        $this->size += $size = strlen((string) $bin);
+
+        print(
+            Message::magenta(
+                sprintf(
+                    _("\tsize: %d"),
+                    $size
+                )
+            )
+        );
+
+        // Get meta headers info
+        if ($response->getMeta())
+        {
+            print(
+                Message::magenta(
+                    sprintf(
+                        _("\tmeta: %s"),
+                        $response->getMeta()
+                    )
+                )
+            );
+        }
+
+        print(
+            Message::magenta(
+                sprintf(
+                    _("\ttime: %f -d %f"),
+                    $time,
+                    $this->option->delay + $time
+                )
+            )
+        );
+
+        // Set downloader mode
+        $raw = ($this->option->raw || !str_contains((string) $response->getMeta(), 'text/gemini'));
+
+        // Parse gemtext
+        if (!$raw && $this->option->crawl)
+        {
+            // Parse gemtext content
+            $document = new Document(
+                $response->getBody()
+            );
+
+            // Get link entities
+            foreach ($document->getLinks() as $link)
+            {
+                // Build new address
+                $address = new Address(
+                    $link->getAddress()
+                );
+
+                // Make relative links absolute
+                $address->toAbsolute(
+                    $source
+                );
+
+                // Check link match common source rules, @TODO --external links
+                if (!$this->_source($address->get()) || $address->getHost() != $source->getHost())
+                {
+                    // Count every skipped link exactly once
+                    $this->skip++;
+
+                    continue;
+                }
+
+                // Address --keep not requested
+                if (!$this->option->keep)
+                {
+                    // Replace link to local path
+                    $link->setAddress(
+                        $this->filesystem->getFilenameFromNetAddress(
+                            $address,
+                            $this->option->index,
+                        )
+                    );
+                }
+
+                // Append new address to crawler pool
+                $this->addSource(
+                    $address->get()
+                );
+            }
+        }
+
+        // Build document filesystem location
+        $filename = $this->filesystem->getFilenameFromNetAddress(
+            $source,
+            $this->option->index
+        );
+
+        // Save document to file
+        // (original body in raw mode, otherwise gemtext with rewritten links)
+        $result = $this->filesystem->save(
+            $filename,
+            $raw || empty($document) ? (string) $response->getBody()
+                                     : $document->toString()
+        );
+
+        // Debug FS
+        if ($result)
+        {
+            print(
+                Message::green(
+                    _("\tsave: ") . $filename
+                )
+            );
+
+            $this->save++;
+        }
+
+        else
+        {
+            print(
+                Message::red(
+                    _("\tfail: ") . $filename
+                )
+            );
+
+            $this->fail++;
+        }
+
+        // Crawl mode enabled
+        if ($this->option->crawl)
+        {
+            // Crawl next
+            $this->start(
+                $offset + 1
+            );
+        }
+    }
+
+    // Print error message, optionally followed by the help text
+    public static function exception(
+        string $message,
+        ?string $help = null
+    ): void
+    {
+        print(
+            Message::red(
+                $message
+            )
+        );
+
+        if ($help)
+        {
+            print(
+                Message::plain(
+                    $help
+                )
+            );
+        }
+    }
+
+    // Local helpers
+
+    // Check given address is allowed for the crawler queue:
+    // it must use the gemini:// scheme and match the --match expression
+    private function _source(
+        string $value
+    ): bool
+    {
+        // Supported Gemini protocol links only
+        if(!str_starts_with($value, 'gemini://'))
+        {
+            return false;
+        }
+
+        // Make sure link --match option
+        if (!preg_match($this->option->match, $value))
+        {
+            return false;
+        }
+
+        return true;
+    }
+
+    // Build final report from the counters collected by start()
+    private function _summary(): string
+    {
+        return implode(
+            PHP_EOL,
+            [
+                PHP_EOL,
+                Message::blue(
+                    _('----------------')
+                ),
+                Message::blue(
+                    _('crawl completed!'),
+                    true
+                ),
+                Message::magenta(
+                    sprintf(
+                        _("\tdocs: %d"),
+                        count(
+                            $this->source
+                        )
+                    )
+                ),
+                Message::magenta(
+                    sprintf(
+                        _("\tsave: %d"),
+                        $this->save
+                    )
+                ),
+                Message::magenta(
+                    sprintf(
+                        _("\tskip: %d"),
+                        $this->skip
+                    )
+                ),
+                Message::magenta(
+                    sprintf(
+                        _("\tfail: %d"),
+                        $this->fail
+                    )
+                ),
+                Message::magenta(
+                    sprintf(
+                        _("\tsize: %d"),
+                        $this->size
+                    )
+                ),
+                Message::magenta(
+                    sprintf(
+                        _("\ttime: %f -d %f"),
+                        $this->time,
+                        $this->option->delay * count(
+                            $this->source
+                        ) + $this->time
+                    )
+                ),
+                PHP_EOL
+            ]
+        );
+    }
+}
\ No newline at end of file
diff --git a/src/Model/Cli/Message.php b/src/Model/Cli/Message.php
new file mode 100755
index 0000000..b87f473
--- /dev/null
+++ b/src/Model/Cli/Message.php
@@ -0,0 +1,60 @@
+crawl = boolval(
+            isset($options['crawl']) || isset($options['c']) || $this->crawl
+        );
+
+        $this->delay = intval(
+            $options['delay'] ?? $options['d'] ?? $this->delay
+        );
+
+        $this->external = boolval(
+            isset($options['external']) || isset($options['e']) || $this->external
+        );
+
+        $this->follow = intval(
+            $options['follow'] ?? $options['f'] ?? $this->follow
+        );
+
+        $this->help = boolval(
+            isset($options['help']) || isset($options['h']) || $this->help
+        );
+
+        $this->index = strval(
+            $options['index'] ?? $options['i'] ?? $this->index
+        );
+
+        $this->keep = boolval(
+            isset($options['keep']) || isset($options['k']) || $this->keep
+        );
+
+        $this->level = intval(
+            $options['level'] ?? $options['l'] ?? $this->level
+        );
+
+        $this->match = strval(
+            $options['match'] ?? $options['m'] ?? $this->match
+        );
+
+        $this->raw = boolval(
+            isset($options['raw']) || isset($options['r']) || $this->raw
+        );
+
+        $this->source = strval(
+            $options['source'] ?? $options['s'] ?? null
+        );
+
+        $this->target = strval(
+            $options['target'] ?? $options['t'] ?? null
+        );
+
+        $this->unique = boolval(
+            isset($options['unique']) || isset($options['u']) || $this->unique
+        );
+
+        // Throw help @TODO
+        if ($this->help)
+        {
+            throw new \Exception;
+        }
+
+        // Validate source
+        if (empty($this->source))
+        {
+            throw new \Exception(
+                _('--source argument required!')
+            );
+        }
+
+        if(!str_starts_with($this->source, 'gemini://'))
+        {
+            throw new \Exception(
+                _('--source protocol not supported!')
+            );
+        }
+
+        if (!preg_match($this->match, $this->source))
+        {
+            throw new \Exception(
+                _('--source does not --match condition!')
+            );
+        }
+
+        // Validate target
+        if (empty($this->target))
+        {
+            throw new \Exception(
+                _('--target argument required!')
+            );
+        }
+
+        if (!is_dir($this->target))
+        {
+            throw new \Exception(
+                _('--target location not exists!')
+            );
+        }
+
+        if (!is_readable($this->target))
+        {
+            throw new \Exception(
+                _('--target location not readable!')
+            );
+        }
+
+        if (!is_writable($this->target))
+        {
+            throw new \Exception(
+                _('--target location not writable!')
+            );
+        }
+
+        // Validate index
+        if (!$extension = pathinfo($this->index, PATHINFO_EXTENSION))
+        {
+            throw new \Exception(
+                _('--index filename must have extension!')
+            );
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Model/Filesystem.php b/src/Model/Filesystem.php
new file mode 100755
index 0000000..064dcbf
--- /dev/null
+++ b/src/Model/Filesystem.php
@@ -0,0 +1,177 @@
+_filepath = realpath(
+                $directory
+            ) . DIRECTORY_SEPARATOR;
+
+            if ($version)
+            {
+                $this->_filepath .= $version . DIRECTORY_SEPARATOR;
+            }
+        }
+    }
+
+    // Get storage location (with trailing directory separator)
+    public function getFilepath(): string
+    {
+        return $this->_filepath;
+    }
+
+    // Convert network address to the local filename inside of the storage
+    //
+    // The scheme prefix is dropped and URL separators are converted to the
+    // local directory separator. When $index is given, it is appended to
+    // addresses that point to a directory, have no file extension or consist
+    // of the host name only.
+    //
+    // Throws \Exception on empty address, address without scheme
+    // or when the resulting filename is an existing directory
+    public function getFilenameFromNetAddress(
+        \Yggverse\Net\Address $address,
+        ?string $index = null
+    ): ?string
+    {
+        if (empty($address->get()))
+        {
+            throw new \Exception(
+                _('Incorrect target address')
+            );
+        }
+
+        if (empty($address->getScheme()))
+        {
+            throw new \Exception(
+                _('Scheme required for target address')
+            );
+        }
+
+        $filename = $this->_filepath . str_replace(
+            [
+                $address->getScheme() . '://',
+                '/'
+            ],
+            [
+                '',
+                DIRECTORY_SEPARATOR
+            ],
+            $address->get()
+        );
+
+        // URL separators are already converted at this point,
+        // so the trailing separator check must use the local one
+        if ($index && (str_ends_with($filename, DIRECTORY_SEPARATOR) || !pathinfo($filename, PATHINFO_EXTENSION) || basename($filename) == $address->getHost()))
+        {
+            $filename = rtrim(
+                $filename,
+                DIRECTORY_SEPARATOR
+            ) . DIRECTORY_SEPARATOR . $index;
+        }
+
+        if (is_dir($filename))
+        {
+            throw new \Exception(
+                _('Target filename linked to directory')
+            );
+        }
+
+        return $filename;
+    }
+
+    // Write data to the file, creating parent directories on demand
+    //
+    // Returns true on success (including empty documents), false when the
+    // write failed. Throws \Exception when the filename is outside of the
+    // storage location or the target directory is not usable
+    public function save(
+        string $filename,
+        string $data
+    ): bool
+    {
+        if (!str_starts_with($filename, $this->_filepath))
+        {
+            throw new \Exception(
+                _('Target filename out of storage location')
+            );
+        }
+
+        // Parent directory of the target file; dirname() is used because
+        // removing basename by string replace also cuts equal path segments
+        $filepath = dirname(
+            $filename
+        );
+
+        if (!is_dir($filepath))
+        {
+            // Failure is reported by the is_dir check below
+            @mkdir(
+                $filepath,
+                0777, // @TODO be careful with leading zero
+                true
+            );
+        }
+
+        if (!is_dir($filepath))
+        {
+            throw new \Exception(
+                _('Could not create target directory')
+            );
+        }
+
+        if (!is_readable($filepath))
+        {
+            throw new \Exception(
+                _('Target directory is not readable')
+            );
+        }
+
+        if (!is_writable($filepath))
+        {
+            throw new \Exception(
+                _('Target directory is not writable')
+            );
+        }
+
+        // file_put_contents returns bytes written (0 for empty data)
+        // or false on failure, so compare with false explicitly
+        return false !== file_put_contents(
+            $filename,
+            $data
+        );
+    }
+}
\ No newline at end of file
diff --git a/src/gemini-dl.php b/src/gemini-dl.php
new file mode 100755
index 0000000..15befeb
--- /dev/null
+++ b/src/gemini-dl.php
@@ -0,0 +1,51 @@
+#!/usr/bin/env php
+
+start();
+}
+
+// Something went wrong
+catch (\Exception $data)
+{
+    Cli::exception(
+        $data->getMessage(),
+        file_get_contents(
+            __DIR__ .
+            DIRECTORY_SEPARATOR . '..' .
+            DIRECTORY_SEPARATOR . 'help.gmi'
+        )
+    );
+}
\ No newline at end of file