implement transliteration word forms in search #33

This commit is contained in:
ghost 2023-10-28 00:03:42 +03:00
parent c7c5d7340c
commit 997666ab8e
8 changed files with 144 additions and 58 deletions

View file

@ -229,8 +229,6 @@ class TorrentController extends AbstractController
$activityService
);
//
// Init request
$query = $request->get('query') ? explode(' ', urldecode($request->get('query'))) : [];
$page = $request->get('page') ? (int) $request->get('page') : 1;
@ -883,13 +881,12 @@ class TorrentController extends AbstractController
$file->getPathName(),
(bool) $this->getParameter('app.index.torrent.name'),
(bool) $this->getParameter('app.index.torrent.filenames'),
(bool) $this->getParameter('app.index.torrent.hash.v1'),
(bool) $this->getParameter('app.index.torrent.hash.v2'),
(bool) $this->getParameter('app.index.torrent.source'),
(bool) $this->getParameter('app.index.torrent.comment'),
(bool) $this->getParameter('app.index.transliteration'),
(bool) $this->getParameter('app.index.torrent.name.enabled'),
(bool) $this->getParameter('app.index.torrent.filenames.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v1.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v2.enabled'),
(bool) $this->getParameter('app.index.torrent.source.enabled'),
(bool) $this->getParameter('app.index.torrent.comment.enabled'),
(int) $this->getParameter('app.index.word.length.min'),
(int) $this->getParameter('app.index.word.length.max'),
@ -2453,13 +2450,12 @@ class TorrentController extends AbstractController
{
// Reindex keywords
$torrentService->reindexTorrentKeywordsAll(
(bool) $this->getParameter('app.index.torrent.name'),
(bool) $this->getParameter('app.index.torrent.filenames'),
(bool) $this->getParameter('app.index.torrent.hash.v1'),
(bool) $this->getParameter('app.index.torrent.hash.v2'),
(bool) $this->getParameter('app.index.torrent.source'),
(bool) $this->getParameter('app.index.torrent.comment'),
(bool) $this->getParameter('app.index.transliteration'),
(bool) $this->getParameter('app.index.torrent.name.enabled'),
(bool) $this->getParameter('app.index.torrent.filenames.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v1.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v2.enabled'),
(bool) $this->getParameter('app.index.torrent.source.enabled'),
(bool) $this->getParameter('app.index.torrent.comment.enabled'),
(int) $this->getParameter('app.index.word.length.min'),
(int) $this->getParameter('app.index.word.length.max')
);

View file

@ -73,27 +73,36 @@ class TorrentRepository extends ServiceEntityRepository
int $userId,
array $keywords,
array $locales,
?bool $sensitive = null,
?bool $approved = null,
?bool $status = null,
?bool $sensitive = null,
?bool $approved = null,
?bool $status = null
): \Doctrine\ORM\QueryBuilder
{
$query = $this->createQueryBuilder('t');
if ($keywords)
{
$andKeywords = $query->expr()->andX();
foreach ($keywords as $i => $keyword)
{
$keyword = mb_strtolower($keyword); // all keywords stored in lowercase
// Make query to the index case insensitive
$keyword = mb_strtolower($keyword);
$andKeywords->add("t.keywords LIKE :keyword{$i}");
// Init OR condition for each word form
$orKeywords = $query->expr()->orX();
$orKeywords->add("t.keywords LIKE :keyword{$i}");
$query->setParameter(":keyword{$i}", "%{$keyword}%");
}
$query->andWhere($andKeywords);
// Generate word forms for each transliteration locale #33
foreach ($this->generateWordForms($keyword) as $j => $wordForm)
{
$orKeywords->add("t.keywords LIKE :keyword{$i}{$j}");
$query->setParameter(":keyword{$i}{$j}", "%{$wordForm}%");
}
// Append AND condition
$query->andWhere($orKeywords);
}
}
if ($locales)
@ -153,4 +162,59 @@ class TorrentRepository extends ServiceEntityRepository
return $query;
}
/**
 * Word forms generator to improve search results,
 * e.g. transliteration rules for latin filenames (#33).
 *
 * Two steps are applied, in this order:
 *  1. the keyword is transliterated once per locale in $transliteration
 *     using ashtokalo/php-translit;
 *  2. for every form produced by step 1, one extra form is added per
 *     $charForms rule, with each occurrence of the rule key replaced by
 *     the rule value. Rules are applied independently of each other,
 *     they are not combined.
 *
 * The keyword itself is not added explicitly; it is only part of the
 * result when a transliteration returns it unchanged.
 *
 * @param string                $keyword         Search keyword to expand
 * @param string[]              $transliteration Locale codes passed to the transliterator
 * @param array<string, string> $charForms       Replacement rules (from => to)
 *
 * @return string[] Unique, non-empty word forms as a zero-based list
 */
private function generateWordForms(
    string $keyword,
    // #33 supported locales:
    // https://github.com/ashtokalo/php-translit
    array $transliteration = [
        'be',
        'bg',
        'el',
        'hy',
        'kk',
        'mk',
        'ru',
        'ka',
        'uk'
    ],
    // Additional char forms
    array $charForms =
    [
        'c' => 'k',
        'k' => 'c',
    ]
): array
{
    // Char forms are derived from transliterated forms only,
    // so without locales there is nothing to generate
    if (!$transliteration)
    {
        return [];
    }

    // Resolve the transliterator once instead of on every iteration
    $translit = \ashtokalo\translit\Translit::object();

    // Apply transliteration
    $wordForms = [];

    foreach ($transliteration as $locale)
    {
        $wordForms[] = $translit->convert(
            $keyword,
            $locale
        );
    }

    // Apply char forms to a separate list, appended afterwards,
    // to keep the original order: transliterations first
    $charWordForms = [];

    foreach ($wordForms as $wordForm)
    {
        foreach ($charForms as $from => $to)
        {
            $charWordForms[] = str_replace(
                (string) $from, // numeric-like keys are stored as int by PHP
                $to,
                $wordForm
            );
        }
    }

    // Remove duplicates and empty forms: an empty form would become
    // the match-everything pattern "%%" in a LIKE condition
    $uniqueForms = array_filter(
        array_unique(
            array_merge(
                $wordForms,
                $charWordForms
            )
        ),
        static fn ($wordForm): bool => (string) $wordForm !== ''
    );

    // array_unique / array_filter preserve keys, re-index to return
    // a dense list (keys may be used to build query placeholder names)
    return array_values(
        $uniqueForms
    );
}
}

View file

@ -64,7 +64,6 @@ class TorrentService
public function generateTorrentKeywordsByString(
string $string,
bool $transliteration,
int $wordLengthMin,
int $wordLengthMax,
): array
@ -97,11 +96,6 @@ class TorrentService
{
// Apply case insensitive search conversion
$words[$key] = mb_strtolower($value);
if ($transliteration)
{
// @TODO
}
}
}
@ -129,7 +123,6 @@ class TorrentService
bool $extractSource,
bool $extractComment,
bool $wordTransliteration,
int $wordLengthMin,
int $wordLengthMax
@ -147,7 +140,6 @@ class TorrentService
$keywords,
$this->generateTorrentKeywordsByString(
$name,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
@ -163,7 +155,6 @@ class TorrentService
$keywords,
$this->generateTorrentKeywordsByString(
$list['path'],
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
@ -179,7 +170,6 @@ class TorrentService
$keywords,
$this->generateTorrentKeywordsByString(
$source,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
@ -195,7 +185,6 @@ class TorrentService
$keywords,
$this->generateTorrentKeywordsByString(
$comment,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
@ -301,7 +290,6 @@ class TorrentService
bool $extractSource,
bool $extractComment,
bool $wordTransliteration,
int $wordLengthMin,
int $wordLengthMax,
@ -326,7 +314,6 @@ class TorrentService
$extractInfoHashV2,
$extractSource,
$extractComment,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
),
@ -623,7 +610,6 @@ class TorrentService
bool $extractInfoHashV2,
bool $extractSource,
bool $extractComment,
bool $wordTransliteration,
int $wordLengthMin,
int $wordLengthMax
): void
@ -643,7 +629,6 @@ class TorrentService
$extractInfoHashV2,
$extractSource,
$extractComment,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)