diff --git a/app/Http/Controllers/Admin/AdminAutospamController.php b/app/Http/Controllers/Admin/AdminAutospamController.php new file mode 100644 index 000000000..8adc45312 --- /dev/null +++ b/app/Http/Controllers/Admin/AdminAutospamController.php @@ -0,0 +1,255 @@ +whereNull('appeal_handled_at')->count(); + }); + + $closed = Cache::remember('admin-dash:reports:spam-count-closed', 3600, function() { + return AccountInterstitial::whereType('post.autospam')->whereNotNull('appeal_handled_at')->count(); + }); + + $thisWeek = Cache::remember('admin-dash:reports:spam-count-stats-this-week ', 86400, function() { + $sr = config('database.default') == 'pgsql' ? "to_char(created_at, 'MM-YYYY')" : "DATE_FORMAT(created_at, '%m-%Y')"; + $gb = config('database.default') == 'pgsql' ? [DB::raw($sr)] : DB::raw($sr); + $s = AccountInterstitial::select( + DB::raw('count(id) as count'), + DB::raw($sr . " as month_year") + ) + ->where('created_at', '>=', now()->subWeeks(52)) + ->groupBy($gb) + ->get() + ->map(function($s) { + $dt = now()->parse('01-' . $s->month_year); + return [ + 'id' => $dt->format('Ym'), + 'x' => $dt->format('M Y'), + 'y' => $s->count + ]; + }) + ->sortBy('id') + ->values() + ->toArray(); + return $s; + }); + + $files = [ + 'spam' => [ + 'exists' => Storage::exists(AutospamService::MODEL_SPAM_PATH), + 'size' => 0 + ], + 'ham' => [ + 'exists' => Storage::exists(AutospamService::MODEL_HAM_PATH), + 'size' => 0 + ], + 'combined' => [ + 'exists' => Storage::exists(AutospamService::MODEL_FILE_PATH), + 'size' => 0 + ] + ]; + + if($files['spam']['exists']) { + $files['spam']['size'] = Storage::size(AutospamService::MODEL_SPAM_PATH); + } + + if($files['ham']['exists']) { + $files['ham']['size'] = Storage::size(AutospamService::MODEL_HAM_PATH); + } + + if($files['combined']['exists']) { + $files['combined']['size'] = Storage::size(AutospamService::MODEL_FILE_PATH); + } + + return [ + 'autospam_enabled' => (bool) config_cache('pixelfed.bouncer.enabled') ?? 
false, + 'nlp_enabled' => (bool) AutospamService::active(), + 'files' => $files, + 'open' => $open, + 'closed' => $closed, + 'graph' => collect($thisWeek)->map(fn($s) => $s['y'])->values(), + 'graphLabels' => collect($thisWeek)->map(fn($s) => $s['x'])->values() + ]; + } + + public function getAutospamReportsClosedApi(Request $request) /* Cursor-paginated (6 per page) post.autospam reports that are flagged as spam and already handled, newest id first. */ + { + $appeals = AdminSpamReport::collection( + AccountInterstitial::orderBy('id', 'desc') + ->whereType('post.autospam') + ->whereIsSpam(true) + ->whereNotNull('appeal_handled_at') + ->cursorPaginate(6) + ->withQueryString() + ); + + return $appeals; + } + + public function postAutospamTrainSpamApi(Request $request) /* Queues the spam pre-training job; aborts with 422 when fewer than 100 spam-flagged statuses exist or a run was queued within the throttle window. */ + { + $aiCount = AccountInterstitial::whereItemType('App\Status') + ->whereIsSpam(true) + ->count(); + abort_if($aiCount < 100, 422, 'You don\'t have enough data to pre-train against.'); + + $existing = Cache::get('pf:admin:autospam:pretrain:recent'); + abort_if($existing, 422, 'You\'ve already run this recently, please wait 30 minutes before pre-training again'); + AutospamPretrainPipeline::dispatch(); + Cache::put('pf:admin:autospam:pretrain:recent', 1, now()->addMinutes(30)); /* throttle window now matches the 30 minutes promised in the message above (was 1440, i.e. 24 minutes when the TTL is read as seconds) */ + + return [ + 'msg' => 'Success!' + ]; + } + + public function postAutospamTrainNonSpamSearchApi(Request $request) /* Looks up at most 10 profiles whose status and domain are both NULL and whose username contains q, ordered by follower count. */ + { + $this->validate($request, [ + 'q' => 'required|string|min:1' + ]); + + $q = $request->input('q'); /* NOTE(review): q goes into the LIKE pattern unescaped, so % and _ typed by the admin act as wildcards - confirm this is acceptable */ + + $res = Profile::whereNull(['status', 'domain']) + ->where('username', 'like', '%' . $q . 
'%') + ->orderByDesc('followers_count') + ->take(10) + ->get() + ->map(function($p) { + $acct = AccountService::get($p->id, true); + return [ + 'id' => (string) $p->id, + 'avatar' => $acct['avatar'], + 'username' => $p->username + ]; + }) + ->values(); + return $res; + } + + public function postAutospamTrainNonSpamSubmitApi(Request $request) /* Queues ham pre-training for 1-10 selected profiles (status and domain both NULL); responds 422 when none of the submitted ids resolves to such a profile. */ + { + $this->validate($request, [ + 'accounts' => 'required|array|min:1|max:10' + ]); + + $accts = $request->input('accounts'); + + $accounts = Profile::whereNull(['domain', 'status'])->find(collect($accts)->pluck('id')->filter()); /* pluck() yields null for entries that are not arrays or lack an id instead of raising an error, and filter() drops those, so a malformed payload reaches the 422 below */ + + abort_if(!$accounts || !$accounts->count(), 422, 'One or more of the selected accounts are not valid'); + + AutospamPretrainNonSpamPipeline::dispatch($accounts); + return $accounts; + } + + public function getAutospamCustomTokensApi(Request $request) /* Custom tokens, newest first, cursor-paginated 6 per page. */ + { + return AutospamCustomTokens::latest()->cursorPaginate(6); + } + + public function saveNewAutospamCustomTokensApi(Request $request) /* Creates a custom token and queues a rebuild of the cached model data; only token is validated here, any category other than 'spam' is stored as 'ham'. */ + { + $this->validate($request, [ + 'token' => 'required|unique:autospam_custom_tokens,token', + ]); + + $ct = new AutospamCustomTokens; + $ct->token = $request->input('token'); + $ct->weight = $request->input('weight'); + $ct->category = $request->input('category') === 'spam' ? 
'spam' : 'ham'; + $ct->note = $request->input('note'); + $ct->active = $request->input('active'); + $ct->save(); + + AutospamUpdateCachedDataPipeline::dispatch(); + return $ct; + } + + public function updateAutospamCustomTokensApi(Request $request) /* Updates weight, category, note and active of an existing custom token (404 via findOrFail when the id is unknown), then queues a rebuild of the cached model data. */ + { + $this->validate($request, [ + 'id' => 'required', + 'token' => 'required', /* required by validation, but the token text itself is not written back below */ + 'category' => 'required|in:spam,ham', + 'active' => 'required|boolean' + ]); + + $ct = AutospamCustomTokens::findOrFail($request->input('id')); + $ct->weight = $request->input('weight'); /* weight and note are taken from the request without a validation rule */ + $ct->category = $request->input('category'); + $ct->note = $request->input('note'); + $ct->active = $request->input('active'); + $ct->save(); + + AutospamUpdateCachedDataPipeline::dispatch(); + + return $ct; + } + + public function exportAutospamCustomTokensApi(Request $request) /* Despite the name, this downloads the file at AutospamService::MODEL_SPAM_PATH; 422 when that file does not exist. */ + { + abort_if(!Storage::exists(AutospamService::MODEL_SPAM_PATH), 422, 'Autospam Dataset does not exist, please train spam before attempting to export'); + return Storage::download(AutospamService::MODEL_SPAM_PATH); + } + + public function enableAutospamApi(Request $request) /* Turns the autospam.nlp.enabled setting on and clears the cached eligibility check so it is re-evaluated. */ + { + ConfigCacheService::put('autospam.nlp.enabled', true); + Cache::forget(AutospamService::CHCKD_CACHE_KEY); + return ['msg' => 'Success']; + } + + public function disableAutospamApi(Request $request) /* Turns the autospam.nlp.enabled setting off and clears the cached eligibility check. */ + { + ConfigCacheService::put('autospam.nlp.enabled', false); + Cache::forget(AutospamService::CHCKD_CACHE_KEY); + return ['msg' => 'Success']; + } +} diff --git a/app/Http/Controllers/AdminController.php b/app/Http/Controllers/AdminController.php index caa9c19fc..e54908a41 100644 --- a/app/Http/Controllers/AdminController.php +++ b/app/Http/Controllers/AdminController.php @@ -21,6 +21,7 @@ use Carbon\Carbon; use Illuminate\Http\Request; use Illuminate\Support\Facades\Redis; use App\Http\Controllers\Admin\{ + AdminAutospamController, AdminDirectoryController, AdminDiscoverController, AdminHashtagsController, @@ -43,6 +44,7 @@ use App\Models\CustomEmoji; class AdminController extends Controller { use 
AdminReportController, + AdminAutospamController, AdminDirectoryController, AdminDiscoverController, AdminHashtagsController, diff --git a/app/Jobs/AutospamPipeline/AutospamPretrainNonSpamPipeline.php b/app/Jobs/AutospamPipeline/AutospamPretrainNonSpamPipeline.php new file mode 100644 index 000000000..348f8e0ea --- /dev/null +++ b/app/Jobs/AutospamPipeline/AutospamPretrainNonSpamPipeline.php @@ -0,0 +1,58 @@ +accounts = $accounts; + $this->classifier = new Classifier(); + } + + /** + * Execute the job: teach the classifier up to 400 random public, captioned statuses per given account as 'ham', write the exported model to MODEL_HAM_PATH and queue a rebuild of the cached data. + */ + public function handle(): void + { + $classifier = $this->classifier; /* created fresh in the constructor, so the export below replaces the stored ham model rather than extending it */ + $accounts = $this->accounts; + + foreach($accounts as $acct) { + Status::whereNotNull('caption') + ->whereScope('public') + ->whereProfileId($acct->id) + ->inRandomOrder() + ->take(400) + ->pluck('caption') + ->each(function($c) use ($classifier) { + $classifier->learn($c, 'ham'); + }); + } + + Storage::put(AutospamService::MODEL_HAM_PATH, $classifier->export()); + + AutospamUpdateCachedDataPipeline::dispatch()->delay(5); /* delay(5): presumably seconds, leaving time for the write above to land - TODO confirm */ + } +} diff --git a/app/Jobs/AutospamPipeline/AutospamPretrainPipeline.php b/app/Jobs/AutospamPipeline/AutospamPretrainPipeline.php new file mode 100644 index 000000000..f1f637c37 --- /dev/null +++ b/app/Jobs/AutospamPipeline/AutospamPretrainPipeline.php @@ -0,0 +1,63 @@ +classifier = new Classifier(); + } + + /** + * Execute the job. 
+ */ + public function handle(): void /* Learns the captions of a random sample of spam-flagged statuses as 'spam' and writes the exported model to MODEL_SPAM_PATH; does nothing below 100 flagged statuses. */ + { + $classifier = $this->classifier; + + $aiCount = AccountInterstitial::whereItemType('App\Status') + ->whereIsSpam(true) + ->count(); + + if($aiCount < 100) { + return; + } + + AccountInterstitial::whereItemType('App\Status') + ->whereIsSpam(true) + ->inRandomOrder() + ->take(config('autospam.nlp.spam_sample_limit')) + ->pluck('item_id') + ->each(function ($ai) use($classifier) { + $status = Status::whereNotNull('caption')->find($ai); /* one lookup per sampled id; statuses that are gone or have no caption are skipped */ + if(!$status) { + return; + } + $classifier->learn($status->caption, 'spam'); + }); + + Storage::put(AutospamService::MODEL_SPAM_PATH, $classifier->export()); + + AutospamUpdateCachedDataPipeline::dispatch()->delay(5); + } +} diff --git a/app/Jobs/AutospamPipeline/AutospamUpdateCachedDataPipeline.php b/app/Jobs/AutospamPipeline/AutospamUpdateCachedDataPipeline.php new file mode 100644 index 000000000..c223e74f7 --- /dev/null +++ b/app/Jobs/AutospamPipeline/AutospamUpdateCachedDataPipeline.php @@ -0,0 +1,79 @@ +get(); + foreach($newSpam as $ns) { + $key = strtolower($ns->token); /* custom tokens are merged case-insensitively */ + if(isset($spam['words']['spam'][$key])) { + $spam['words']['spam'][$key] = $spam['words']['spam'][$key] + $ns->weight; + } else { + $spam['words']['spam'][$key] = $ns->weight; + } + } + $newSpamCount = count($spam['words']['spam']); /* the stored document count is set to the number of distinct words */ + $spam['documents']['spam'] = $newSpamCount; + arsort($spam['words']['spam']); + Storage::put(AutospamService::MODEL_SPAM_PATH, json_encode($spam, JSON_UNESCAPED_SLASHES|JSON_PRETTY_PRINT)); + + $ham = json_decode(Storage::get(AutospamService::MODEL_HAM_PATH), true); + $newHam = AutospamCustomTokens::whereCategory('ham')->get(); + foreach($newHam as $ns) { + $key = strtolower($ns->token); + if(isset($ham['words']['ham'][$key])) { /* fixed: this tested $spam['words']['ham'], so an existing ham weight was overwritten instead of increased */ + $ham['words']['ham'][$key] = $ham['words']['ham'][$key] + $ns->weight; + } else { + $ham['words']['ham'][$key] = $ns->weight; + } + } + + $newHamCount = count($ham['words']['ham']); + $ham['documents']['ham'] = $newHamCount; + arsort($ham['words']['ham']); + 
+ Storage::put(AutospamService::MODEL_HAM_PATH, json_encode($ham, JSON_UNESCAPED_SLASHES|JSON_PRETTY_PRINT)); /* NOTE(review): this is the same file the ham model was read from, so custom-token weights are added again on every run - confirm intended */ + + $combined = [ + 'documents' => [ + 'spam' => $newSpamCount, + 'ham' => $newHamCount, + ], + 'words' => [ + 'spam' => $spam['words']['spam'], + 'ham' => $ham['words']['ham'] + ] + ]; + + Storage::put(AutospamService::MODEL_FILE_PATH, json_encode($combined, JSON_PRETTY_PRINT|JSON_UNESCAPED_SLASHES)); /* fixed: the flags were separated by a comma, which passed JSON_UNESCAPED_SLASHES as the depth argument */ + Cache::forget(AutospamService::MODEL_CACHE_KEY); /* drop the cached model and eligibility result so the next read picks up the new files */ + Cache::forget(AutospamService::CHCKD_CACHE_KEY); + } +} diff --git a/app/Services/AutospamService.php b/app/Services/AutospamService.php new file mode 100644 index 000000000..6986e81e4 --- /dev/null +++ b/app/Services/AutospamService.php @@ -0,0 +1,78 @@ +import($model['documents'], $model['words']); + return $classifier->most($text) === 'spam'; + } + + public static function eligible() /* True only when both bouncer/autospam config flags are on, all three model files exist and the combined file is at least 1000 bytes; the result is cached for 86400 seconds. */ + { + return Cache::remember(self::CHCKD_CACHE_KEY, 86400, function() { + if(!config_cache('pixelfed.bouncer.enabled') || !config('autospam.enabled')) { + return false; + } + + if(!Storage::exists(self::MODEL_SPAM_PATH)) { + return false; + } + + if(!Storage::exists(self::MODEL_HAM_PATH)) { + return false; + } + + if(!Storage::exists(self::MODEL_FILE_PATH)) { + return false; + } else { + if(Storage::size(self::MODEL_FILE_PATH) < 1000) { + return false; + } + } + + return true; + }); + } + + public static function active() /* The NLP check runs only when the admin toggle is on and eligible() passes. */ + { + return config_cache('autospam.nlp.enabled') && self::eligible(); + } + + public static function getCachedModel() /* Returns the decoded combined model (cached for 86400 seconds), or null when inactive or the file is empty. */ + { + if(!self::active()) { + return null; + } + + return Cache::remember(self::MODEL_CACHE_KEY, 86400, function() { + $res = Storage::get(self::MODEL_FILE_PATH); + if(!$res || empty($res)) { + return null; + } + + return json_decode($res, true); + }); + } +} diff --git a/app/Services/ConfigCacheService.php b/app/Services/ConfigCacheService.php index 7ecb318e0..9da5c8adc 100644 --- a/app/Services/ConfigCacheService.php +++ b/app/Services/ConfigCacheService.php @@ -69,7 +69,9 @@ class 
ConfigCacheService 'instance.landing.show_directory', 'instance.landing.show_explore', 'instance.admin.pid', - 'instance.banner.blurhash' + 'instance.banner.blurhash', + + 'autospam.nlp.enabled', // 'system.user_mode' ]; diff --git a/app/Util/Lexer/Classifier.php b/app/Util/Lexer/Classifier.php new file mode 100644 index 000000000..61f7b694c --- /dev/null +++ b/app/Util/Lexer/Classifier.php @@ -0,0 +1,178 @@ + + */ + private $tokenizer; + + /** + * @var array> + */ + private array $words = []; + + /** + * @var array + */ + private array $documents = []; + + private bool $uneven = false; + + /** + * @param callable(string): array $tokenizer + */ + public function setTokenizer(callable $tokenizer): void + { + $this->tokenizer = $tokenizer; + } + + /** + * @return Collection + */ + public function tokenize(string $string): Collection + { + if ($this->tokenizer) { + /** @var array */ + $tokens = call_user_func($this->tokenizer, $string); + + return collect($tokens); + } + + return Str::of($string) + ->lower() + ->matchAll('/[[:alpha:]]+/u'); /* default tokenizer: lower-cased runs of letters */ + } + + /** + * @return $this + */ + public function learn(string $statement, string $type): self + { + foreach ($this->tokenize($statement) as $word) { + $this->incrementWord($type, $word); + } + + $this->incrementType($type); /* one document per learned statement, even when it produced no tokens */ + + return $this; + } + + /** + * @return Collection + */ + public function guess(string $statement): Collection + { + $words = $this->tokenize($statement); + + return collect($this->documents) + ->map(function ($count, string $type) use ($words) { + $likelihood = log($this->pTotal($type)); /* scores are log-likelihoods: the former product of probabilities underflowed to 0 on long statements, tying every type so most() just returned the first one */ + + foreach ($words as $word) { + $likelihood += log($this->p($word, $type)); /* p() is always greater than 0, so log() is defined */ + } + + return (string) BigDecimal::of($likelihood); + }) + ->sortDesc(); /* highest score first; the ranking equals that of the raw products whenever those do not underflow */ + } + + public function most(string $statement): string + { + /** @var string */ + return $this->guess($statement)->keys()->first(); + } + + /** + * @return self + */ + public function uneven(bool $enabled = false): self + { + $this->uneven = $enabled; + + return $this; + } + + /** 
+ * Increment the document count for the type + */ + private function incrementType(string $type): void + { + if (! isset($this->documents[$type])) { + $this->documents[$type] = 0; + } + + $this->documents[$type]++; + } + + /** + * Increment the word count for the given type; words on the ignore list are never counted for 'spam' + */ + private function incrementWord(string $type, string $word): void + { + $ignored = config('autospam.ignored_tokens'); /* comma-separated list; falls back to a small built-in stop-word list when unset */ + if(!$ignored) { + $ignored = ['the', 'a', 'of', 'and']; + } else { + $ignored = explode(',', $ignored); + } + if ($type === 'spam' && in_array($word, $ignored, true)) { /* strict match, so numeric-looking tokens such as '1e3' and '1000' are not treated as equal */ + return; + } + if (! isset($this->words[$type][$word])) { + $this->words[$type][$word] = 0; + } + + $this->words[$type][$word]++; + } + + /** + * @return float|int add-one smoothed share of $word among all words counted for $type + */ + private function p(string $word, string $type) + { + $count = $this->words[$type][$word] ?? 0; + + return ($count + 1) / (array_sum($this->words[$type] ?? []) + 1); /* a type can have documents but no words (e.g. only empty statements learned); treat that as an empty word list instead of failing */ + } + + /** + * @return float|int + */ + private function pTotal(string $type) + { + return $this->uneven + ? 
($this->documents[$type] + 1) / (array_sum($this->documents) + 1) + : 1; + } + + public function export() + { + $words = $this->words; + $words = collect($words) + ->map(function($w) { + arsort($w); + return $w; + }) + ->all(); + return json_encode([ + '_ns' => 'https://pixelfed.org/ns/nlp', + '_v' => '1.0', + 'documents' => $this->documents, + 'words' => $words + ], JSON_PRETTY_PRINT|JSON_UNESCAPED_SLASHES); + } + + public function import($documents, $words) + { + $this->documents = $documents; + $this->words = $words; + } +} diff --git a/public/js/admin.js b/public/js/admin.js index b2efdf1dd..48d5949b8 100644 Binary files a/public/js/admin.js and b/public/js/admin.js differ diff --git a/public/mix-manifest.json b/public/mix-manifest.json index beeaf8368..2675c6ea9 100644 Binary files a/public/mix-manifest.json and b/public/mix-manifest.json differ diff --git a/resources/views/admin/autospam/home.blade.php b/resources/views/admin/autospam/home.blade.php new file mode 100644 index 000000000..338b546e7 --- /dev/null +++ b/resources/views/admin/autospam/home.blade.php @@ -0,0 +1,12 @@ +@extends('admin.partial.template-full') + +@section('section') + + +@endsection + +@push('scripts') + +@endpush diff --git a/resources/views/admin/settings/sidebar.blade.php b/resources/views/admin/settings/sidebar.blade.php index b6f83bd04..be411198e 100644 --- a/resources/views/admin/settings/sidebar.blade.php +++ b/resources/views/admin/settings/sidebar.blade.php @@ -1,31 +1,76 @@ -@section('menu') -