mirror of
https://github.com/YGGverse/Yo.git
synced 2026-03-31 17:55:35 +00:00
implement DNS resolver with memory cache feature #15
This commit is contained in:
parent
298322a4c3
commit
1b8bcb084a
4 changed files with 209 additions and 12 deletions
|
|
@ -17,8 +17,7 @@ $config = json_decode(
|
|||
$semaphore = sem_get(
|
||||
crc32(
|
||||
__DIR__ . '.yo.cli.document.crawl'
|
||||
),
|
||||
1
|
||||
), 1
|
||||
);
|
||||
|
||||
if (false === sem_acquire($semaphore, true))
|
||||
|
|
@ -125,6 +124,34 @@ catch (Exception $exception)
|
|||
exit;
|
||||
}
|
||||
|
||||
// Init memory
|
||||
try
|
||||
{
|
||||
$memory = new \Memcached();
|
||||
|
||||
$memory->addServer(
|
||||
$config->memcached->server->host,
|
||||
$config->memcached->server->port
|
||||
);
|
||||
}
|
||||
|
||||
catch (Exception $exception)
|
||||
{
|
||||
if ($config->cli->document->crawl->debug->level->error)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [error] %s') . PHP_EOL,
|
||||
date('c'),
|
||||
print_r(
|
||||
$exception,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
exit;
|
||||
}
|
||||
|
||||
// Debug totals
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
{
|
||||
|
|
@ -172,6 +199,125 @@ foreach($index->search('')
|
|||
$document->get('url'),
|
||||
$config->manticore->index->document->name
|
||||
);
|
||||
} // @TODO
|
||||
|
||||
// Init base address
|
||||
$base = new \Yggverse\Net\Address(
|
||||
$document->get('url')
|
||||
);
|
||||
|
||||
// Init worker address
|
||||
$address = new \Yggverse\Net\Address(
|
||||
$document->get('url')
|
||||
);
|
||||
|
||||
// Custom resolver enabled
|
||||
if ($config->cli->document->crawl->resolver->enabled
|
||||
&&
|
||||
// Host is still a domain name, i.e. not yet resolved
|
||||
\Yggverse\Net\Valid::domainHostName(
|
||||
$address->getHost()
|
||||
)
|
||||
) {
|
||||
// Generate memory ID
|
||||
$id = sprintf(
|
||||
'%s.%s.resolved',
|
||||
$config->manticore->index->document->name,
|
||||
$address->getHost()
|
||||
);
|
||||
|
||||
// Check for cached results
|
||||
if ($host = $memory->get($id))
|
||||
{
|
||||
$address->setHost(
|
||||
$host
|
||||
);
|
||||
|
||||
// Debug event
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [notice] resolve "%s" as "%s" from cache') . PHP_EOL,
|
||||
date('c'),
|
||||
$base->getHost(),
|
||||
$address->getHost()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise retry the resolver in a loop until the address is resolved
|
||||
else
|
||||
{
|
||||
$attempt = 1;
|
||||
|
||||
do
|
||||
{
|
||||
// Resolve begin
|
||||
$resolve = new \Yggverse\Net\Resolve(
|
||||
$config->cli->document->crawl->resolver->records,
|
||||
$config->cli->document->crawl->resolver->providers,
|
||||
$config->cli->document->crawl->resolver->connection->timeout,
|
||||
$config->cli->document->crawl->resolver->result->shuffle
|
||||
);
|
||||
|
||||
$result = [];
|
||||
$errors = [];
|
||||
|
||||
$resolved = $resolve->address(
|
||||
$address,
|
||||
$result,
|
||||
$errors
|
||||
);
|
||||
|
||||
if ($resolved)
|
||||
{
|
||||
// Update address
|
||||
$address = $resolved;
|
||||
|
||||
// Update cache
|
||||
$memory->set(
|
||||
$id,
|
||||
$address->getHost(),
|
||||
$config->cli->document->crawl->resolver->result->cache->timeout + time()
|
||||
);
|
||||
|
||||
// Debug event
|
||||
if ($config->cli->document->crawl->debug->level->notice)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [notice] resolve "%s" as "%s"') . PHP_EOL,
|
||||
date('c'),
|
||||
$base->getHost(),
|
||||
$address->getHost()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
// Log event
|
||||
if ($config->cli->document->crawl->debug->level->warning)
|
||||
{
|
||||
echo sprintf(
|
||||
_('[%s] [warning] could not resolve "%s" (attempt: %d, response: %s), wait for reconnection...') . PHP_EOL,
|
||||
date('c'),
|
||||
$base->getHost(),
|
||||
$attempt++,
|
||||
print_r(
|
||||
$errors,
|
||||
true
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
// Wait before the next resolve attempt
|
||||
sleep(
|
||||
$config->cli->document->crawl->resolver->connection->delay
|
||||
);
|
||||
}
|
||||
|
||||
} while (!$resolved);
|
||||
}
|
||||
}
|
||||
|
||||
// Update index time anyway and set reset code to 51
|
||||
|
|
@ -184,9 +330,21 @@ foreach($index->search('')
|
|||
$document->getId()
|
||||
);
|
||||
|
||||
// Request remote URL
|
||||
// Prepare remote request
|
||||
$request = new \Yggverse\Gemini\Client\Request(
|
||||
$document->get('url')
|
||||
$address->get()
|
||||
);
|
||||
|
||||
// Apply stream options
|
||||
$request->setOptions(
|
||||
[
|
||||
'ssl' =>
|
||||
[
|
||||
'peer_name' => $base->getHost(), // SNI
|
||||
'verify_peer' => $config->cli->document->crawl->connection->options->ssl->verify_peer,
|
||||
'verify_peer_name' => $config->cli->document->crawl->connection->options->ssl->verify_peer_name
|
||||
]
|
||||
]
|
||||
);
|
||||
|
||||
$response = new \Yggverse\Gemini\Client\Response(
|
||||
|
|
@ -301,10 +459,6 @@ foreach($index->search('')
|
|||
);
|
||||
|
||||
// Crawl links
|
||||
$base = new \Yggverse\Net\Address(
|
||||
$document->get('url')
|
||||
);
|
||||
|
||||
$documents = [];
|
||||
|
||||
foreach ($body->getLinks() as $line)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue