Update Lexer/Extractor

This commit is contained in:
Daniel Supernault 2018-11-20 21:05:10 -07:00
parent 952f3aa0b2
commit 685df5f690
No known key found for this signature in database
GPG key ID: 0DEF1C662C9033F7

View file

@ -98,7 +98,6 @@ class Extractor extends Regex
$entities = array_merge($entities, $this->extractURLsWithIndices($tweet)); $entities = array_merge($entities, $this->extractURLsWithIndices($tweet));
$entities = array_merge($entities, $this->extractHashtagsWithIndices($tweet, false)); $entities = array_merge($entities, $this->extractHashtagsWithIndices($tweet, false));
$entities = array_merge($entities, $this->extractMentionsOrListsWithIndices($tweet)); $entities = array_merge($entities, $this->extractMentionsOrListsWithIndices($tweet));
$entities = array_merge($entities, $this->extractCashtagsWithIndices($tweet));
$entities = $this->removeOverlappingEntities($entities); $entities = $this->removeOverlappingEntities($entities);
return $entities; return $entities;
@ -303,33 +302,6 @@ class Extractor extends Regex
*/ */
public function extractCashtagsWithIndices($tweet = null)
{
    // Fall back to the tweet stored on this instance when none is passed in.
    if ($tweet === null) {
        $tweet = $this->tweet;
    }
    // Cheap pre-check: skip the full cashtag pattern when there is no "$" at all.
    if (!preg_match('/\$/iu', $tweet)) {
        return [];
    }
    // preg_match_all() returns false on a PCRE error and 0 when nothing matched;
    // either way there is nothing to extract, so bail out before iterating.
    if (!preg_match_all(self::$patterns['valid_cashtag'], $tweet, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        return [];
    }
    $tags = [];
    foreach ($matches as $match) {
        // Five slots are unpacked below (whole match, prefix, "$", symbol text, trailing
        // context), so pad to five: a shorter match row must not leave $cash_text or
        // $outer undefined. Each slot is a [text, byte offset] pair.
        list(, , $dollar, $cash_text, $outer) = array_pad($match, 5, ['', 0]);
        // Reject candidates whose trailing context matches the end_hashtag_match pattern.
        if (preg_match(self::$patterns['end_hashtag_match'], $outer[0])) {
            continue;
        }
        // PREG_OFFSET_CAPTURE reports byte offsets. Measure the byte-sliced prefix with
        // StringUtils::strlen to turn it into a string-length index
        // (presumably character-based - confirm against StringUtils).
        $start_position = $dollar[1] > 0 ? StringUtils::strlen(substr($tweet, 0, $dollar[1])) : $dollar[1];
        $end_position = $start_position + StringUtils::strlen($dollar[0].$cash_text[0]);
        $tags[] = [
            'cashtag' => $cash_text[0],
            'indices' => [$start_position, $end_position],
        ];
    }
    return $tags;
}
/** /**