implement DNS resolver with memory cache feature #15

This commit is contained in:
yggverse 2024-04-07 03:54:55 +03:00
parent 298322a4c3
commit 1b8bcb084a
4 changed files with 209 additions and 12 deletions

View file

@ -17,8 +17,7 @@ $config = json_decode(
// Obtain a System V semaphore whose integer key is the crc32 checksum of this
// script's directory path plus the fixed suffix '.yo.cli.document.crawl'.
// NOTE(review): presumably this serves as a single-instance lock for the crawler — confirm against the acquire/exit logic that follows.
$semaphore = sem_get(
crc32(
__DIR__ . '.yo.cli.document.crawl'
),
1
), 1
);
if (false === sem_acquire($semaphore, true))
@ -125,6 +124,34 @@ catch (Exception $exception)
exit;
}
// Init memory
//
// Creates the Memcached client ($memory) that the crawler uses below to cache
// resolver results, and registers the server configured under
// $config->memcached->server. On failure the error is reported (when error
// level debug output is enabled) and the script exits.
try
{
    $memory = new \Memcached();

    // Memcached::addServer() signals failure through its boolean return value
    // instead of throwing, so convert a false result into an exception to route
    // it through the same error handler below.
    if (false === $memory->addServer(
        $config->memcached->server->host,
        $config->memcached->server->port
    )) {
        throw new Exception(
            sprintf(
                _('could not add memcached server "%s:%s"'),
                $config->memcached->server->host,
                $config->memcached->server->port
            )
        );
    }
}

catch (Exception $exception)
{
    // Print the exception dump only when error level debug output is enabled
    if ($config->cli->document->crawl->debug->level->error)
    {
        echo sprintf(
            _('[%s] [error] %s') . PHP_EOL,
            date('c'),
            print_r(
                $exception,
                true
            )
        );
    }

    // The cache could not be initialized: stop the script
    exit;
}
// Debug totals
if ($config->cli->document->crawl->debug->level->notice)
{
@ -172,6 +199,125 @@ foreach($index->search('')
$document->get('url'),
$config->manticore->index->document->name
);
} // @TODO
// Init base address
// Built from the document URL and never modified in this section; its host is
// what the log messages below report as the original (unresolved) name.
$base = new \Yggverse\Net\Address(
$document->get('url')
);
// Init worker address
// Starts from the same document URL; its host may be replaced below, either
// from the cache or by the resolver result.
$address = new \Yggverse\Net\Address(
$document->get('url')
);
// Custom resolver enabled
if ($config->cli->document->crawl->resolver->enabled
&&
// Host still not resolved
// NOTE(review): presumably Valid::domainHostName() is true only for domain names, not for IP literals — confirm
\Yggverse\Net\Valid::domainHostName(
$address->getHost()
)
) {
// Generate memory ID
// Cache key format: "<document index name>.<host>.resolved"
$id = sprintf(
'%s.%s.resolved',
$config->manticore->index->document->name,
$address->getHost()
);
// Check for cached results
// A truthy cached value is applied as the worker address host; any falsy
// result from get() falls through to the resolve loop in the else branch.
if ($host = $memory->get($id))
{
$address->setHost(
$host
);
// Debug event
if ($config->cli->document->crawl->debug->level->notice)
{
echo sprintf(
_('[%s] [notice] resolve "%s" as "%s" from cache') . PHP_EOL,
date('c'),
$base->getHost(),
$address->getHost()
);
}
}
// Init connection loop until the address is resolved
// NOTE(review): there is no attempt limit here — the loop repeats, sleeping
// between attempts, until address() returns a truthy value.
else
{
// Attempt number, used only in the warning message below
$attempt = 1;
do
{
// Resolve begin
// A new resolver is constructed on every attempt from the configured
// records, providers, connection timeout and result shuffle option.
$resolve = new \Yggverse\Net\Resolve(
$config->cli->document->crawl->resolver->records,
$config->cli->document->crawl->resolver->providers,
$config->cli->document->crawl->resolver->connection->timeout,
$config->cli->document->crawl->resolver->result->shuffle
);
// Reset per attempt and handed to address() below
// NOTE(review): presumably filled by reference with result/error details — confirm against \Yggverse\Net\Resolve
$result = [];
$errors = [];
$resolved = $resolve->address(
$address,
$result,
$errors
);
if ($resolved)
{
// Update address
// The truthy return value of address() replaces the worker address
$address = $resolved;
// Update cache
// Expiration is passed as the configured cache timeout added to the
// current Unix time.
$memory->set(
$id,
$address->getHost(),
$config->cli->document->crawl->resolver->result->cache->timeout + time()
);
// Debug event
if ($config->cli->document->crawl->debug->level->notice)
{
echo sprintf(
_('[%s] [notice] resolve "%s" as "%s"') . PHP_EOL,
date('c'),
$base->getHost(),
$address->getHost()
);
}
}
else
{
// Log event
// The attempt counter is advanced only inside this message, i.e. only
// when warning level output is enabled.
if ($config->cli->document->crawl->debug->level->warning)
{
echo sprintf(
_('[%s] [warning] could not resolve "%s" (attempt: %d, response: %s), wait for reconnection...') . PHP_EOL,
date('c'),
$base->getHost(),
$attempt++,
print_r(
$errors,
true
)
);
}
// Next connection delay
// Pause for the configured delay before the next attempt
sleep(
$config->cli->document->crawl->resolver->connection->delay
);
}
} while (!$resolved);
}
}
// Update index time anyway and set reset code to 51
@ -184,9 +330,21 @@ foreach($index->search('')
$document->getId()
);
// Request remote URL
// Prepare remote request
// Builds the Gemini client request object for the current document.
$request = new \Yggverse\Gemini\Client\Request(
$document->get('url')
$address->get()
);
// Apply stream options
// The TLS peer name is taken from $base (the address built from the document
// URL), not from $address, whose host may have been replaced by the resolver.
// Both certificate verification flags are read from the crawler configuration.
$request->setOptions(
[
'ssl' =>
[
'peer_name' => $base->getHost(), // SNI
'verify_peer' => $config->cli->document->crawl->connection->options->ssl->verify_peer,
'verify_peer_name' => $config->cli->document->crawl->connection->options->ssl->verify_peer_name
]
]
);
$response = new \Yggverse\Gemini\Client\Response(
@ -301,10 +459,6 @@ foreach($index->search('')
);
// Crawl links
$base = new \Yggverse\Net\Address(
$document->get('url')
);
$documents = [];
foreach ($body->getLinks() as $line)