diff --git a/README.md b/README.md index 58cb0f7..12c20ea 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,6 @@ php src/cli/document/crawl.php php src/cli/document/clean.php ``` -* remove `url` duplicates * make index optimization ##### Search diff --git a/src/cli/document/add.php b/src/cli/document/add.php index 4e62fe0..53ae617 100644 --- a/src/cli/document/add.php +++ b/src/cli/document/add.php @@ -29,7 +29,7 @@ $crc32url = crc32($url); // Check URL for exist $result = $index->search('') - ->filter('crc32url', $crc32url) + ->filter('id', $crc32url) ->limit(1) ->get(); @@ -47,9 +47,9 @@ if ($result->getTotal()) // Add $result = $index->addDocument( [ - 'url' => $url, - 'crc32url' => $crc32url - ] + 'url' => $url + ], + $crc32url ); echo sprintf( diff --git a/src/cli/document/clean.php b/src/cli/document/clean.php index 8e85d6a..a2e86b8 100644 --- a/src/cli/document/clean.php +++ b/src/cli/document/clean.php @@ -36,57 +36,6 @@ $index = $client->index( $config->manticore->index->document->name ); -// Get totals -$total = $index->search('') - ->option('cutoff', 0) - ->limit(0) - ->get() - ->getTotal(); - -// Delete duplicates #5 -$delete = []; - -foreach($index->search('')->limit($total)->get() as $queue) -{ - $duplicates = $index->search('')->filter('crc32url', $queue->crc32url)->limit($total)->get(); - - if ($duplicates->getTotal() > 1) - { - foreach ($duplicates as $duplicate) - { - $delete[$duplicate->crc32url][] = $duplicate->getId(); - } - } -} - -$i = 0; -foreach ($delete as $crc32url => $ids) -{ - $j = 0; - foreach ($ids as $id) - { - $i++; - $j++; - - // Skip first link - if ($j == 1) continue; - - // Delete duplicate - $index->deleteDocument($id); - } -} - -// Free mem -$delete = []; - -// @TODO $config->cli->document->crawl->skip->stripos->url - -// Dump operation result -echo sprintf( - _('duplicated URLs deleted: %s') . PHP_EOL, - number_format($i) -); - // Optimize indexes echo _('indexes optimization begin') . PHP_EOL; diff --git a/src/cli/document/crawl.php b/src/cli/document/crawl.php index 7a49873..82e1f87 100644 --- a/src/cli/document/crawl.php +++ b/src/cli/document/crawl.php @@ -98,7 +98,6 @@ foreach($index->search('') $data = [ 'url' => $document->get('url'), - 'crc32url' => $document->get('crc32url'), 'title' => $document->get('title'), 'description' => $document->get('description'), 'keywords' => $document->get('keywords'), @@ -355,17 +354,20 @@ foreach($index->search('') $crc32url = crc32($url); if (!$index->search('') - ->filter('crc32url', $crc32url) + ->filter('id', $crc32url) ->limit(1) ->get() ->getTotal()) { + echo 'add'; + /* $index->addDocument( [ - 'url' => $url, - 'crc32url' => $crc32url - ] + 'url' => $url + ], + $crc32url ); + */ if ($config->cli->document->crawl->debug->level->notice) { @@ -383,10 +385,16 @@ foreach($index->search('') // Replace document data // https://github.com/manticoresoftware/manticoresearch-php/issues/10#issuecomment-612685916 + + // @TODO optimization for replacements required + // https://manual.manticoresearch.com/Data_creation_and_modification/Updating_documents/REPLACE + echo 'replace'; + /* $result = $index->replaceDocument( $data, $document->getId() ); + */ // Debug result if ($config->cli->document->crawl->debug->level->notice) diff --git a/src/cli/index/init.php b/src/cli/index/init.php index 0c6fdb5..e35fd2c 100644 --- a/src/cli/index/init.php +++ b/src/cli/index/init.php @@ -79,10 +79,6 @@ $result = $index->create( 'time' => [ 'type' => 'integer' - ], - 'crc32url' => - [ - 'type' => 'bigint' ] ], (array) $config->manticore->index->document->settings diff --git a/src/cli/yggo/import.php b/src/cli/yggo/import.php index 7116cde..26a9a89 100644 --- a/src/cli/yggo/import.php +++ b/src/cli/yggo/import.php @@ -128,7 +128,7 @@ for ($i = 0; $i <= $total; $i++) if (isset($argv[6])) { $local = $index->search('') - ->filter('crc32url', $crc32url) + ->filter('id', $crc32url) ->limit(1) ->get(); @@ -149,7 +149,6 @@ for ($i = 0; $i <= $total; $i++) $index->addDocument( [ 'url' => $url, - 'crc32url' => (int) $crc32url, 'time' => (int) $remote->timeUpdated, 'code' => (int) $remote->httpCode, 'size' => (int) $remote->size, @@ -157,7 +156,8 @@ for ($i = 0; $i <= $total; $i++) 'title' => (string) $remote->title, 'description' => (string) $remote->description, 'keywords' => (string) $remote->keywords - ] + ], + (int) $crc32url ); // Result diff --git a/src/webui/search.php b/src/webui/search.php index 64161a3..d2466c7 100644 --- a/src/webui/search.php +++ b/src/webui/search.php @@ -60,7 +60,7 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE // Check URL for exist $exist = $index->search('') - ->filter('crc32url', $crc32url) + ->filter('id', $crc32url) ->limit(1) ->get() ->getTotal(); @@ -82,9 +82,9 @@ if ($config->webui->search->index->request->url->enabled && filter_var($q, FILTE $index->addDocument( [ - 'url' => $url, - 'crc32url' => $crc32url - ] + 'url' => $url + ], + $crc32url ); $response = sprintf( @@ -113,7 +113,7 @@ switch (true) case filter_var($q, FILTER_VALIDATE_URL): - $query = $index->search('')->filter('crc32url', crc32($q)); + $query = $index->search('')->filter('id', crc32($q)); break;