From 2961045c766545234b848a079ebe883bdb2e8df7 Mon Sep 17 00:00:00 2001
From: ghost
Date: Mon, 27 Nov 2023 19:29:17 +0200
Subject: [PATCH] implement index cleaner tool #5

---
 README.md                  |  9 +++++
 src/cli/document/clean.php | 80 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 src/cli/document/clean.php

diff --git a/README.md b/README.md
index 1e9e376..8264395 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,15 @@ php src/cli/document/add.php URL
 php src/cli/document/crawl.php
 ```
 
+##### Clean
+
+```
+php src/cli/document/clean.php
+```
+
+* remove `url` duplicates
+* make index optimization
+
 ##### Search
 
 ```
diff --git a/src/cli/document/clean.php b/src/cli/document/clean.php
new file mode 100644
index 0000000..a1b8cde
--- /dev/null
+++ b/src/cli/document/clean.php
@@ -0,0 +1,80 @@
+// NOTE(review): everything before this point is missing from the reviewed copy
+// of this patch: the PHP open tag, whatever defines $config, and the start of
+// the call that receives the array below and yields $client. The first
+// surviving line is itself cut at its start (presumably a 'host' => key).
+// Restore the prologue from the original commit; the hunk size above and the
+// diffstat were not recomputed.
+ $config->manticore->server->host,
+        'port' => $config->manticore->server->port,
+    ]
+);
+
+// Init index
+$index = $client->index(
+    $config->manticore->index->document->name
+);
+
+// Get totals: limit(0) requests no rows, only the match count is read
+$total = $index->search('')
+               ->option('cutoff', 0)
+               ->limit(0)
+               ->get()
+               ->getTotal();
+
+// Collect duplicates #5
+//
+// $delete maps a crc32url value to the ids of all documents sharing it.
+// Every crc32url is looked up once only: repeating the lookup for each member
+// of a group appends the same ids again, and the document that is meant to be
+// kept then reappears further down the list and gets deleted as well.
+//
+// NOTE(review): documents are matched on crc32url alone, so two different URLs
+// with the same crc32url would be treated as duplicates - confirm acceptable.
+// NOTE(review): limit($total) may exceed the server-side max_matches setting
+// on large indexes - confirm.
+$delete = [];
+
+foreach ($index->search('')->limit($total)->get() as $queue)
+{
+    $crc32url = $queue->crc32url;
+
+    // Group already collected by an earlier member
+    if (isset($delete[$crc32url]))
+    {
+        continue;
+    }
+
+    $duplicates = $index->search('')
+                        ->filter('crc32url', $crc32url)
+                        ->limit($total)
+                        ->get();
+
+    if ($duplicates->getTotal() > 1)
+    {
+        foreach ($duplicates as $duplicate)
+        {
+            $delete[$crc32url][] = $duplicate->getId();
+        }
+    }
+}
+
+// Delete duplicates, keeping the first document found for every crc32url
+$deleted = 0;
+
+foreach ($delete as $ids)
+{
+    // Skip first link
+    foreach (array_slice($ids, 1) as $id)
+    {
+        // Delete duplicate
+        $index->deleteDocument($id);
+
+        // Count only documents that were really removed
+        $deleted++;
+    }
+}
+
+// Free mem
+unset($delete);
+
+// Dump operation result
+echo sprintf(
+    _('duplicated URLs deleted: %s') . PHP_EOL,
+    number_format($deleted)
+);
+
+// Optimize indexes
+echo _('indexes optimization begin') . PHP_EOL;
+
+$index->optimize();
+
+echo _('indexes optimization completed') . PHP_EOL;
\ No newline at end of file