mirror of
https://github.com/YGGverse/Yo.git
synced 2026-03-31 17:55:35 +00:00
set crc32url as document id
This commit is contained in:
parent
93baed4b90
commit
6f4abe4729
7 changed files with 25 additions and 73 deletions
|
|
@ -36,57 +36,6 @@ $index = $client->index(
|
|||
$config->manticore->index->document->name
|
||||
);
|
||||
|
||||
// Get totals
//
// limit(0) returns no rows; only the total match count is read from the result
$total = $index->search('')
               ->option('cutoff', 0)
               ->limit(0)
               ->get()
               ->getTotal();

// Delete duplicates #5
//
// Collect document IDs grouped by crc32url. Each group is resolved exactly once:
// without the isset() guard below, every member of a duplicate group would
// re-append the whole group, so the "kept" first ID would show up again later
// in the list and be deleted together with its duplicates.
$delete = [];

foreach ($index->search('')->limit($total)->get() as $queue)
{
  // This URL group has already been collected by an earlier member
  if (isset($delete[$queue->crc32url])) continue;

  $duplicates = $index->search('')->filter('crc32url', $queue->crc32url)->limit($total)->get();

  if ($duplicates->getTotal() > 1)
  {
    foreach ($duplicates as $duplicate)
    {
      $delete[$queue->crc32url][] = $duplicate->getId();
    }
  }
}

// Number of documents actually deleted (the kept document is not counted)
$i = 0;

foreach ($delete as $crc32url => $ids)
{
  // Skip first link, delete the remaining duplicates
  foreach (array_slice($ids, 1) as $id)
  {
    $index->deleteDocument($id);

    $i++;
  }
}

// Free mem
$delete = [];

// @TODO $config->cli->document->crawl->skip->stripos->url

// Dump operation result
echo sprintf(
  _('duplicated URLs deleted: %s') . PHP_EOL,
  number_format($i)
);
|
||||
|
||||
// Optimize indexes
|
||||
echo _('indexes optimization begin') . PHP_EOL;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue