set crc32url as document id

This commit is contained in:
ghost 2024-01-22 22:52:37 +02:00
parent 93baed4b90
commit 6f4abe4729
7 changed files with 25 additions and 73 deletions

View file

@@ -36,57 +36,6 @@ $index = $client->index(
$config->manticore->index->document->name
);
// Get totals
//
// limit(0) asks for no hits, only the match count is read. The 'cutoff'
// option is set to 0 so the counter is not stopped early
// NOTE(review): confirm cutoff=0 means "unlimited" for the server version in use
$total = $index->search('')
               ->option('cutoff', 0)
               ->limit(0)
               ->get()
               ->getTotal();

// Delete duplicates #5
//
// Documents sharing the same crc32url are treated as copies of one URL.
// Step 1 collects the document IDs of every group that has more than one
// member, step 2 keeps the first ID of each group and deletes the rest.
//
// NOTE(review): limit($total) may be capped by the server side max_matches
// setting, in which case large indexes are processed only partially - verify
$delete = [];

foreach ($index->search('')->limit($total)->get() as $queue)
{
    // This group was already collected while visiting an earlier copy.
    // Collecting it again would list the same IDs twice, and the copy that
    // must be kept would then be deleted on its second occurrence
    if (isset($delete[$queue->crc32url]))
    {
        continue;
    }

    $duplicates = $index->search('')
                        ->filter('crc32url', $queue->crc32url)
                        ->limit($total)
                        ->get();

    if ($duplicates->getTotal() > 1)
    {
        foreach ($duplicates as $duplicate)
        {
            $id = $duplicate->getId();

            // Indexed by ID, so one document is never registered twice;
            // the value keeps the ID in the exact type the client returned
            $delete[$queue->crc32url][$id] = $id;
        }
    }
}

// Number of documents actually deleted
$i = 0;

foreach ($delete as $ids)
{
    // Skip first link, delete every other copy
    foreach (array_slice($ids, 1) as $id)
    {
        // Delete duplicate
        $index->deleteDocument($id);

        $i++;
    }
}

// Free mem
$delete = [];

// @TODO $config->cli->document->crawl->skip->stripos->url

// Dump operation result
echo sprintf(
    _('duplicated URLs deleted: %s') . PHP_EOL,
    number_format($i)
);

// Optimize indexes
echo _('indexes optimization begin') . PHP_EOL;