init gemini protocol implementation

This commit is contained in:
yggverse 2024-04-03 17:07:28 +03:00
parent 8418a56617
commit 1f96ca8a2c
11 changed files with 204 additions and 2422 deletions

View file

@ -6,7 +6,7 @@ $microtime = microtime(true);
// Load dependencies
require_once __DIR__ . '/../../../vendor/autoload.php';
// Define helpers
// Define helpers @TODO move to a separate library (yo-php)
function getLastSnapTime(array $files): int
{
$time = [];
@ -37,6 +37,40 @@ function getLastSnapTime(array $files): int
return 0;
}
/**
 * Convert a link found in a document to an absolute URL.
 *
 * A target that already carries a host is returned unchanged. Any other
 * target is re-rooted on the scheme, host and (optional) port of $source:
 * "./" and "../" segments are stripped, then leading slashes and surrounding
 * dots are trimmed. The target is therefore resolved against the site root,
 * not against the directory of $source.
 *
 * The three by-reference arguments always receive the components of $source
 * (null when a component is absent or $source cannot be parsed), so callers
 * can compare link hosts against the document host for every link.
 *
 * @param string      $source Current document URL, used as the base
 * @param string      $target Relative or absolute link
 * @param string|null $scheme Receives the scheme of $source
 * @param string|null $host   Receives the host of $source
 * @param int|null    $port   Receives the port of $source
 *
 * @return string Absolute URL, or $target unchanged when it has its own host
 */
function relative2absolute(
    string $source, // current document url to grab the base
    string $target, // relative or absolute link
    ?string &$scheme = null,
    ?string &$host = null,
    ?int &$port = null
): string {
    // Read the base from $source (the original referenced an undefined $base).
    // parse_url() yields null for a missing component and false for a
    // malformed URL; normalize both to null to match the declared types.
    $scheme = parse_url($source, PHP_URL_SCHEME) ?: null;
    $host   = parse_url($source, PHP_URL_HOST) ?: null;
    $port   = parse_url($source, PHP_URL_PORT) ?: null;

    // Target already has a host: nothing to resolve
    if (parse_url($target, PHP_URL_HOST))
    {
        return $target;
    }

    // '../' must be replaced before './': every '../' contains './', so the
    // opposite order leaves a stray dot behind (e.g. 'a/../b' => 'a/.b')
    $path = str_replace(
        [
            '../',
            './'
        ],
        '',
        $target
    );

    $path = trim(
        ltrim(
            $path,
            '/'
        ),
        '.'
    );

    return $scheme . '://' . $host . ($port ? ':' . $port : '') . '/' . $path;
}
// Init config
$config = json_decode(
file_get_contents(
@ -182,16 +216,16 @@ foreach($index->search('')
$data =
[
'url' => $document->get('url'),
'title' => $document->get('title'),
'description' => $document->get('description'),
'keywords' => $document->get('keywords'),
'code' => $document->get('code'),
'size' => $document->get('size'),
'mime' => $document->get('mime'),
'rank' => $document->get('rank'),
'time' => $time,
'index' => 0
'url' => $document->get('url'),
'h1' => $document->get('h1'),
'h2' => $document->get('h2'),
'h3' => $document->get('h3'),
'code' => $document->get('code'),
'size' => $document->get('size'),
'meta' => $document->get('meta'),
'rank' => $document->get('rank'),
'time' => $time,
'index' => 0
];
// Debug target
@ -205,114 +239,50 @@ foreach($index->search('')
);
}
// Update index time anyway and set reset code to 404
// Update index time anyway and reset the stored status code to 20
$index->updateDocument(
[
'time' => time(),
'code' => 200,
'code' => 20,
'index' => 0
],
$document->getId()
);
// Request remote URL
$request = curl_init(
$request = new \Yggverse\Gemini\Client\Request(
$document->get('url')
);
// Drop URL with long response
curl_setopt(
$request,
CURLOPT_CONNECTTIMEOUT,
$config->cli->document->crawl->curl->connection->timeout
);
curl_setopt(
$request,
CURLOPT_TIMEOUT,
$config->cli->document->crawl->curl->connection->timeout
);
// Prevent huge content download e.g. media streams URL
curl_setopt(
$request,
CURLOPT_RETURNTRANSFER,
true
);
curl_setopt(
$request,
CURLOPT_NOPROGRESS,
false
);
curl_setopt(
$request,
CURLOPT_PROGRESSFUNCTION,
function(
$download,
$downloaded,
$upload,
$uploaded
) {
global $config;
global $index;
global $document;
$index->updateDocument(
[
'time' => time(),
'code' => 200,
'index' => 0
],
$document->getId()
);
return $downloaded > $config->cli->document->crawl->curl->download->size->max ? 1 : 0;
}
$response = new \Yggverse\Gemini\Client\Response(
$request->getResponse(
$config->cli->document->crawl->connection->timeout,
$config->cli->document->crawl->connection->length,
$config->cli->document->crawl->connection->chunk,
$length
)
);
// Begin request
if ($response = curl_exec($request))
if ($code = $request->getCode()) // @TODO process redirects
{
// Update HTTP code or skip on empty
if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE))
{
// Delete deprecated document from index as HTTP code still not 200
/*
if ($code != 200 && !empty($data['code']) && $data['code'] != 200)
{
$index->deleteDocument(
$document->getId()
);
continue;
}
*/
$data['code'] = $code;
} else continue;
// Update status code
$data['code'] = $code;
// Update size or skip on empty
if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD))
if ($length)
{
$size = round( // float
$size
);
$data['size'] = $size;
$data['size'] = $length;
} else continue;
// Update MIME type or skip on empty
if ($type = curl_getinfo($request, CURLINFO_CONTENT_TYPE))
// Update meta or skip on empty
if ($meta = $response->getMeta())
{
$data['mime'] = $type;
$data['meta'] = $meta;
// On document charset specified
if (preg_match('/charset=([^\s;]+)/i', $type, $charset))
if (preg_match('/charset=([^\s;]+)/i', $meta, $charset))
{
if (!empty($charset[1]))
{
@ -322,10 +292,12 @@ foreach($index->search('')
if (strtolower($charset[1]) == strtolower($encoding))
{
// Convert response to UTF-8
$response = mb_convert_encoding(
$response,
'UTF-8',
$charset[1]
$response->setBody(
mb_convert_encoding(
$response->getBody(),
'UTF-8',
$charset[1]
)
);
break;
@ -336,241 +308,102 @@ foreach($index->search('')
} else continue;
// DOM crawler
if (
false !== stripos($type, 'text/html')
||
false !== stripos($type, 'text/xhtml')
||
false !== stripos($type, 'application/xhtml')
) {
$crawler = new Symfony\Component\DomCrawler\Crawler();
$crawler->addHtmlContent(
$response
// Gemtext parser
if (false !== stripos($response->getMeta(), 'text/gemini'))
{
$body = new \Yggverse\Gemini\Client\Gemtext\Body(
$response->getBody()
);
// Get title
foreach ($crawler->filter('head > title')->each(function($node) {
return $node->text();
}) as $value)
// Get H1
$h1 = [];
foreach ($body->getH1() as $value)
{
if (!empty($value))
{
$data['title'] = trim(
strip_tags(
html_entity_decode(
$value
)
)
);
}
$h1[] = $value;
}
// Get description
foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) {
return $node->attr('content');
}) as $value)
{
if (!empty($value))
{
$data['description'] = trim(
strip_tags(
html_entity_decode(
$value
)
)
);
}
}
// Get keywords
$keywords = [];
// Extract from meta tag
foreach ($crawler->filter('head > meta[name="keywords"]')->each(function($node) {
return $node->attr('content');
}) as $value)
{
if (!empty($value))
{
foreach ((array) explode(
',',
mb_strtolower(
strip_tags(
html_entity_decode(
$value
)
)
)
) as $keyword)
{
// Remove extra spaces
$keyword = trim(
$keyword
);
// Skip short words
if (mb_strlen($keyword) > 2)
{
$keywords[] = $keyword;
}
}
}
}
// Get keywords from headers
/* Disable keywords collection from headers as body index enabled
foreach ($crawler->filter('h1,h2,h3,h4,h5,h6')->each(function($node) {
return $node->text();
}) as $value)
{
if (!empty($value))
{
foreach ((array) explode(
',',
mb_strtolower(
strip_tags(
html_entity_decode(
$value
)
)
)
) as $keyword)
{
// Remove extra spaces
$keyword = trim(
$keyword
);
// Skip short words
if (mb_strlen($keyword) > 2)
{
$keywords[] = $keyword;
}
}
}
}
*/
// Keep keywords unique
$keywords = array_unique(
$keywords
$data['h1'] = implode(
',',
array_unique(
$h1
)
);
// Update previous keywords when new value exists
if ($keywords)
// Get H2
$h2 = [];
foreach ($body->getH2() as $value)
{
$data['keywords'] = implode(',', $keywords);
$h2[] = $value;
}
$data['h2'] = implode(
',',
array_unique(
$h2
)
);
// Get H3
$h3 = [];
foreach ($body->getH3() as $value)
{
$h3[] = $value;
}
$data['h3'] = implode(
',',
array_unique(
$h3
)
);
// Save document body text to index
foreach ($crawler->filter('html > body')->each(function($node) {
$data['body'] = trim(
preg_replace(
'/[\s]{2,}/', // strip extra separators
' ',
$response->getBody()
)
);
return $node->html();
}) as $value)
{
if (!empty($value))
{
$data['body'] = trim(
preg_replace(
'/[\s]{2,}/', // strip extra separators
' ',
strip_tags(
str_replace( // make text separators before strip any closing tag, new line, etc
[
'<',
'>',
PHP_EOL,
],
[
' <',
'> ',
PHP_EOL . ' ',
],
preg_replace(
[
'/<script([^>]*)>([\s\S]*?)<\/script>/i', // strip js content
'/<style([^>]*)>([\s\S]*?)<\/style>/i', // strip css content
'/<pre([^>]*)>([\s\S]*?)<\/pre>/i', // strip code content
'/<code([^>]*)>([\s\S]*?)<\/code>/i',
],
'',
html_entity_decode(
$value
)
)
)
)
)
);
}
}
// Crawl documents
// Crawl links
$documents = [];
$scheme = parse_url($document->get('url'), PHP_URL_SCHEME);
$host = parse_url($document->get('url'), PHP_URL_HOST);
$port = parse_url($document->get('url'), PHP_URL_PORT);
foreach ($config->cli->document->crawl->selector as $selector => $settings)
foreach ($body->getLinks() as $line)
{
foreach ($crawler->filter($selector)->each(function($node) {
$link = new \Yggverse\Gemini\Gemtext\Link(
$line
);
return $node;
if ($url = $link->getAddress())
{
//Make relative links absolute
$url = relative2absolute(
$document->get('url'),
$url,
$scheme,
$host,
$port,
);
}) as $value) {
if ($url = $value->attr($settings->attribute))
// Regex rules
if (!preg_match($config->cli->document->crawl->url->regex, $url))
{
//Make relative links absolute
if (!parse_url($url, PHP_URL_HOST))
{
$url = $scheme . '://' . $host . ($port ? ':' . $port : null) .
'/' .
trim(
ltrim(
str_replace(
[
'./',
'../'
],
'',
$url
),
'/'
),
'.'
);
}
// Regex rules
if (!preg_match($settings->regex, $url))
{
continue;
}
// External host rules
if (!$settings->external && parse_url($url, PHP_URL_HOST) != $host)
{
continue;
}
$documents[] = $url;
continue;
}
// External host rules
if (!$config->cli->document->crawl->url->external && parse_url($url, PHP_URL_HOST) != $host)
{
continue;
}
$documents[] = $url;
}
}
// @TODO find document links by protocol ($body->findLinks('gemini'))
if ($documents)
{
foreach (array_unique($documents) as $url)
@ -578,7 +411,7 @@ foreach($index->search('')
// Apply stripos condition
$skip = false;
foreach ($config->cli->document->crawl->skip->stripos->url as $condition)
foreach ($config->cli->document->crawl->url->skip->stripos as $condition)
{
if (false !== stripos($url, $condition)) {
@ -597,7 +430,7 @@ foreach($index->search('')
date('c'),
$url,
print_r(
$config->cli->document->crawl->skip->stripos->url,
$config->cli->document->crawl->url->skip->stripos,
true
)
);
@ -701,7 +534,7 @@ foreach($index->search('')
}
// Create snap
if ($config->cli->document->crawl->snap->enabled && $code === 200)
if ($config->cli->document->crawl->snap->enabled && $request->getCode() === 20)
{
try
{
@ -734,12 +567,12 @@ foreach($index->search('')
$snap->addFromString(
'DATA',
$response
$response->getBody()
);
$snap->addFromString(
'MIME',
$type
'META',
$response->getMeta()
);
$snap->addFromString(
@ -767,12 +600,12 @@ foreach($index->search('')
// Copy to local storage on enabled
if ($config->snap->storage->local->enabled)
{
// Check for mime allowed
// Check for meta allowed
$allowed = false;
foreach ($config->snap->storage->local->mime->stripos as $whitelist)
foreach ($config->snap->storage->local->meta->stripos as $whitelist)
{
if (false !== stripos($type, $whitelist))
if (false !== stripos($response->getMeta(), $whitelist))
{
$allowed = true;
break;
@ -904,12 +737,12 @@ foreach($index->search('')
continue;
}
// Check for mime allowed
// Check for meta allowed
$allowed = false;
foreach ($ftp->mime->stripos as $whitelist)
foreach ($ftp->meta->stripos as $whitelist)
{
if (false !== stripos($type, $whitelist))
if (false !== stripos($response->getMeta(), $whitelist))
{
$allowed = true;
break;