From 13ebc7bcbc6338ee1acf1f6d56b3b25032ae8dd5 Mon Sep 17 00:00:00 2001 From: yggverse Date: Sat, 4 May 2024 08:25:29 +0300 Subject: [PATCH] init sqlite implementation --- .gitignore | 5 +- README.md | 30 +-- composer.json | 4 +- config/example.json | 88 +++++++++ example/config.json | 23 --- src/Model/Database.php | 171 ++++++++++++++++ src/Model/Filter.php | 69 +++++++ src/crawler.php | 434 +++++++++++++++++++++++++++++++---------- 8 files changed, 672 insertions(+), 152 deletions(-) create mode 100644 config/example.json delete mode 100644 example/config.json create mode 100644 src/Model/Database.php create mode 100644 src/Model/Filter.php diff --git a/.gitignore b/.gitignore index a220498..cc0f11d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ /composer.lock -/config.json + +/config/* +!/config/example.json + /server/ /vendor/ diff --git a/README.md b/README.md index b81da0e..c631914 100644 --- a/README.md +++ b/README.md @@ -2,32 +2,18 @@ RSS Aggregator for [Gemini Protocol](https://geminiprotocol.net) -Simple RSS feed converter to static Gemtext format, useful for news portals or localhost reading +## Components + +* [x] `src/crawler.php` - scan configured RSS feeds and dump results to SQLite (see also [FS branch](https://github.com/YGGverse/Pulsar/tree/fs)) +* [ ] `src/nex.php` - Built-in server for [NEX Protocol](https://nightfall.city/nps/info/specification.txt) +* [ ] `src/gemini.php` - Built-in server for [Gemini Protocol](https://geminiprotocol.net) ## Example -* `nex://[301:23b4:991a:634d::feed]/index.gmi` - [Yggdrasil](https://github.com/yggdrasil-network/yggdrasil-go) instance by YGGverse +* `nex://[301:23b4:991a:634d::feed]` - [Yggdrasil](https://github.com/yggdrasil-network/yggdrasil-go) instance by YGGverse ## Usage 1. `git clone https://github.com/YGGverse/Pulsar.git` -2. `cp example/config.json config.json` - setup your feed locations -3. 
`php src/crawler.php` - grab feeds manually or using crontab - -## Config - -Configuration file supports multiple feed channels with custom settings: - -* `source` - string, filepath or URL to the valid RSS feed -* `target` - string, relative or absolute path to Gemtext dumps -* `item` - * `limit` - integer, how many items to display on page generated - * `template` - string, custom pattern for feed item, that supports following macros - * `{nl}` - new line separator - * `{link}` - item link - * `{guid}` - item guid - * `{pubDate}` - item pubDate, soon with custom time format e.g. `{pubDate:Y-m-d H:s}` - * `{title}` - item title - * `{description}` - item description - -Resulting files could be placed to any local folder (for personal reading) or shared with others (using [gmid](https://github.com/omar-polo/gmid), [twins](https://code.rocket9labs.com/tslocum/twins) or any other [server](https://github.com/kr1sp1n/awesome-gemini#servers) for `gemtext` statics) \ No newline at end of file +2. `cp config/example.json config/name.json` - setup your feed +3. 
`php src/crawler.php name.json` - grab feeds manually or using crontab \ No newline at end of file diff --git a/composer.json b/composer.json index e9fa09c..1d19097 100644 --- a/composer.json +++ b/composer.json @@ -15,7 +15,5 @@ "name": "YGGverse" } ], - "require": { - "yggverse/titan-ii": "^1.0" - } + "require": {} } diff --git a/config/example.json b/config/example.json new file mode 100644 index 0000000..55b2a3b --- /dev/null +++ b/config/example.json @@ -0,0 +1,88 @@ +{ + "database": + { + "location":"example.sqlite", + "username":null, + "password":null + }, + "crawler": + { + "channel": + [ + { + "source":"https://www.omglinux.com/feed", + "enabled":true, + "item": + { + "link": + { + "enabled":true, + "required":false + }, + "pubDate": + { + "enabled":true, + "required":false + }, + "title": + { + "enabled":true, + "required":false + }, + "description": + { + "enabled":true, + "required":false + }, + "content":{ + "enabled":false, + "required":false + } + }, + "debug": + { + "info":true, + "warning":true, + "error":true + } + }, + { + "source":"https://omgubuntu.co.uk/feed", + "enabled":false, + "item": + { + "link": + { + "enabled":true, + "required":false + }, + "pubDate": + { + "enabled":true, + "required":false + }, + "title": + { + "enabled":true, + "required":false + }, + "description": + { + "enabled":true, + "required":false + }, + "content":{ + "enabled":false, + "required":false + } + }, + "debug": + { + "info":true, + "warning":true, + "error":true + } + } + ] + } +} \ No newline at end of file diff --git a/example/config.json b/example/config.json deleted file mode 100644 index 35dd993..0000000 --- a/example/config.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "feed": - [ - { - "source":"https://www.omglinux.com/feed", - "target":"server/127.0.0.1/public/omglinux/feed.gmi", - "item": - { - "template":"=> {link} {title}{nl}{nl}{description}", - "limit":20 - } - }, - { - "source":"https://omgubuntu.co.uk/feed", - 
"target":"server/127.0.0.1/public/omgubuntu/feed.gmi", - "item": - { - "template":"=> {link} {title}{nl}{nl}{description}", - "limit":20 - } - } - ] -} \ No newline at end of file diff --git a/src/Model/Database.php b/src/Model/Database.php new file mode 100644 index 0000000..e94916d --- /dev/null +++ b/src/Model/Database.php @@ -0,0 +1,171 @@ +_database = new \PDO( + sprintf( + 'sqlite:%s', + $database + ), + $username, + $password + ); + + $this->_database->setAttribute( + \PDO::ATTR_ERRMODE, + \PDO::ERRMODE_EXCEPTION + ); + + $this->_database->setAttribute( + \PDO::ATTR_DEFAULT_FETCH_MODE, + \PDO::FETCH_OBJ + ); + + $this->_database->query(' + CREATE TABLE IF NOT EXISTS "channel" + ( + "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + "time" INTEGER NOT NULL, + "source" TEXT NOT NULL, + "link" TEXT, + "title" TEXT, + "description" TEXT + ) + '); + + $this->_database->query(' + CREATE TABLE IF NOT EXISTS "channelItem" + ( + "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + "channelId" INTEGER NOT NULL, + "time" INTEGER NOT NULL, + "pubTime" INTEGER, + "guid" TEXT NOT NULL, + "link" TEXT, + "title" TEXT, + "description" TEXT, + "content" TEXT + ) + '); + } + + public function getChannelIdBySource( + string $source + ): ?int + { + $query = $this->_database->prepare( + 'SELECT `id` FROM `channel` WHERE `source` = :source LIMIT 1' // exact match: LIKE treats % and _ in URLs as wildcards + ); + + $query->execute( + [ + ':source' => $source + ] + ); + + if ($result = $query->fetch()) + { + return (int) $result->id; // PDO may return the column as a string + } + + return null; + } + + public function addChannel( + string $source, + ?string $link, + ?string $title, + ?string $description, + ?int $time = null + ): ?int + { + $query = $this->_database->prepare( + 'INSERT INTO `channel` (`source`, `link`, `title`, `description`, `time`) + VALUES (:source, :link, :title, :description, :time)' + ); + + $query->execute( + [ + ':source' => $source, + ':link' => $link, + ':title' => $title, + ':description' => $description, + ':time' => $time ? 
$time : time() + ] + ); + + if ($id = $this->_database->lastInsertId()) + { + return (int) $id; + } + + return null; + } + + public function isChannelItemExist( + int $channelId, + string $guid + ): bool + { + $query = $this->_database->prepare( + 'SELECT NULL FROM `channelItem` WHERE `channelId` = :channelId AND `guid` = :guid LIMIT 1' // exact match: LIKE treats % and _ in guid as wildcards + ); + + $query->execute( + [ + ':channelId' => $channelId, + ':guid' => $guid + ] + ); + + return (bool) $query->fetch(); + } + + public function addChannelItem( + int $channelId, + string $guid, + ?string $link, + ?string $title, + ?string $description, + ?string $content, + ?int $pubTime, + ?int $time = null + ): ?int + { + $query = $this->_database->prepare( + 'INSERT INTO `channelItem` (`channelId`, `guid`, `link`, `title`, `description`, `content`, `pubTime`, `time`) + VALUES (:channelId, :guid, :link, :title, :description, :content, :pubTime, :time)' + ); + + $query->execute( + [ + ':channelId' => $channelId, + ':guid' => $guid, + ':link' => $link, + ':title' => $title, + ':description' => $description, + ':content' => $content, + ':pubTime' => $pubTime, + ':time' => $time ? 
$time : time() + ] + ); + + if ($id = $this->_database->lastInsertId()) + { + return (int) $id; + } + + return null; + } +} \ No newline at end of file diff --git a/src/Model/Filter.php b/src/Model/Filter.php new file mode 100644 index 0000000..b56d167 --- /dev/null +++ b/src/Model/Filter.php @@ -0,0 +1,69 @@ +text( + $value + ) + ) + ); + } + + public static function description( + string $value + ): string + { + return self::text( // static method: $this is not available here + $value + ); + } + + public static function text( + string $value + ): string + { + return trim( + preg_replace( + [ + '/[\n\r]{2,}/', + '/[\s]{2,}/', + ], + [ + PHP_EOL, + ' ' + ], + strip_tags( + html_entity_decode( + $value + ) + ) + ) + ); + } +} \ No newline at end of file diff --git a/src/crawler.php b/src/crawler.php index 5222575..d65ec52 100644 --- a/src/crawler.php +++ b/src/crawler.php @@ -7,126 +7,354 @@ $semaphore = sem_get( ), 1 ); -if (false === sem_acquire($semaphore, true)) -{ - exit; -} +if (false === sem_acquire($semaphore, true)) exit; + +// Load dependencies +require_once __DIR__ . + DIRECTORY_SEPARATOR . '..'. + DIRECTORY_SEPARATOR . 'vendor' . + DIRECTORY_SEPARATOR . 'autoload.php'; + +// Init profile argument +if (empty($argv[1])) throw new \Exception(); // Init config $config = json_decode( file_get_contents( - __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'config.json' + str_starts_with( + $argv[1], + DIRECTORY_SEPARATOR + ) ? $argv[1] // absolute + : __DIR__ . // relative + DIRECTORY_SEPARATOR . '..'. + DIRECTORY_SEPARATOR . 'config'. + DIRECTORY_SEPARATOR . $argv[1] ) +); if (!$config) throw new \Exception(); + +// Init database +$database = new \Yggverse\Pulsar\Model\Database( + str_starts_with( + $config->database->location, + DIRECTORY_SEPARATOR + ) ? $config->database->location + : __DIR__ . + DIRECTORY_SEPARATOR . '..'. + DIRECTORY_SEPARATOR . 'config'. + DIRECTORY_SEPARATOR . 
$config->database->location, + $config->database->username, + $config->database->password ); -// Update feeds -foreach ($config->feed as $feed) +// Begin channels crawl +foreach ($config->crawler->channel as $channel) { - // Init feed location - $filename = str_starts_with( - $feed->target, - DIRECTORY_SEPARATOR - ) ? $feed->target : __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . $feed->target; - - // Init destination storage - @mkdir( - dirname( - $filename - ), - 0755, - true - ); - - // Get feed data - if (!$channel = simplexml_load_file($feed->source)->channel) + // Check channel enabled + if (!$channel->enabled) { + if ($channel->debug->info) + { + printf( + _('[%s] [info] skip disabled channel "%s"') . PHP_EOL, + date('c'), + $channel->source + ) . PHP_EOL; + } + continue; } - // Update title - if (!empty($channel->title)) + // Get channel data + if (!$remoteChannel = simplexml_load_file($channel->source)->channel) { - $title = trim( - strip_tags( - html_entity_decode( - $channel->title - ) - ) - ); - } - - else - { - $title = parse_url( - $feed->source, - PHP_URL_HOST - ); - } - - file_put_contents( - $filename, - sprintf( - '# %s', - $title - ) . PHP_EOL - ); - - // Append description - if (!empty($channel->description)) - { - file_put_contents( - $filename, - PHP_EOL . trim( - strip_tags( - html_entity_decode( - $channel->description - ) - ) - ) . PHP_EOL, - FILE_APPEND | LOCK_EX - ); - } - - // Append items - $i = 1; foreach ($channel->item as $item) - { - // Apply items limit - if ($i > $feed->item->limit) + if ($channel->debug->warning) { - break; + printf( + _('[%s] [warning] channel "%s" not accessible') . PHP_EOL, + date('c'), + $channel->source + ) . PHP_EOL; } - // Format item - file_put_contents( - $filename, - PHP_EOL . trim( - preg_replace( - '/[\s]{3,}/ui', - PHP_EOL . PHP_EOL, - str_replace( - [ - '{nl}', - '{link}', - '{guid}', - '{pubDate}', - '{title}', - '{description}' - ], - [ - PHP_EOL, - !empty($item->link) ? 
trim($item->link) : '', - !empty($item->guid) ? trim($item->guid) : '', - !empty($item->pubDate) ? trim($item->pubDate) : '', - !empty($item->title) ? trim(strip_tags(html_entity_decode($item->title))) : '', - !empty($item->description) ? trim(strip_tags(html_entity_decode($item->description))) : '' - ], - $feed->item->template - ) . PHP_EOL - ) - ) . PHP_EOL, - FILE_APPEND | LOCK_EX + continue; + } + + // Init channel + if (!$channelId = $database->getChannelIdBySource($channel->source)) + { + // Create new one if not exists + $channelId = $database->addChannel( + $channel->source, + isset($remoteChannel->link) ? (string) $remoteChannel->link : null, + isset($remoteChannel->title) ? (string) $remoteChannel->title : null, + isset($remoteChannel->description) ? (string) $remoteChannel->description : null ); - $i++; + if ($channel->debug->info) + { + printf( + _('[%s] [info] channel "%s" registered as #%d') . PHP_EOL, + date('c'), + $channel->source, + $channelId + ) . PHP_EOL; + } + } + + // Process items + if (!empty($remoteChannel->item)) + { + foreach ($remoteChannel->item as $remoteChannelItem) + { + // Prepare link + $link = null; + + if ($channel->item->link->enabled) + { + if (isset($remoteChannelItem->link)) + { + $link = (string) $remoteChannelItem->link; + } + + else + { + if ($channel->debug->info) + { + printf( + _('[%s] [info] item link enabled but not defined in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + } + + if ($channel->item->link->required && !$link) + { + if ($channel->debug->warning) + { + printf( + _('[%s] [warning] could not get item link for channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . 
PHP_EOL; + } + + continue; + } + } + + // Prepare guid or define it from link + $guid = null; + + if (isset($remoteChannelItem->guid)) + { + $guid = (string) $remoteChannelItem->guid; + } + + else + { + if (!$guid = $link) continue; // neither guid nor link: nothing unique to store the item under + + if ($channel->debug->warning) + { + printf( + _('[%s] [warning] item guid defined as link in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + } + + // Prepare title + $title = null; + + if ($channel->item->title->enabled) + { + if (isset($remoteChannelItem->title)) + { + $title = (string) $remoteChannelItem->title; + } + + else + { + if ($channel->debug->info) + { + printf( + _('[%s] [info] item title enabled but not defined in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + } + + if ($channel->item->title->required && !$title) + { + if ($channel->debug->warning) + { + printf( + _('[%s] [warning] could not get item title in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + + continue; + } + } + + // Prepare description + $description = null; + + if ($channel->item->description->enabled) + { + if (isset($remoteChannelItem->description)) + { + $description = (string) $remoteChannelItem->description; + } + + else + { + if ($channel->debug->info) + { + printf( + _('[%s] [info] item description enabled but not defined in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + } + + if ($channel->item->description->required && !$description) + { + if ($channel->debug->warning) + { + printf( + _('[%s] [warning] could not get item description in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . 
PHP_EOL; + } + + continue; + } + } + + // Prepare content + $content = null; + + if ($channel->item->content->enabled) + { + if ($_content = $remoteChannelItem->children('content', true)) + { + if (isset($_content->encoded)) + { + $content = (string) $_content->encoded; + } + } + + if (!$content && $channel->debug->info) + { + printf( + _('[%s] [info] item content enabled but not defined in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + + if ($channel->item->content->required && !$content) + { + if ($channel->debug->warning) + { + printf( + _('[%s] [warning] could not get item content in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + + continue; + } + } + + // Prepare pubDate + $pubTime = null; + + if ($channel->item->pubDate->enabled) + { + if (isset($remoteChannelItem->pubDate)) + { + if ($_pubTime = strtotime((string) $remoteChannelItem->pubDate)) + { + $pubTime = $_pubTime; + } + + else + { + if ($channel->debug->warning) + { + printf( + _('[%s] [warning] could not convert item pubDate to pubTime in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + } + } + + else + { + if ($channel->debug->info) + { + printf( + _('[%s] [info] item pubDate enabled but not defined in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + } + + if ($channel->item->pubDate->required && !$pubTime) + { + if ($channel->debug->warning) + { + printf( + _('[%s] [warning] could not get item pubDate in channel #%d') . PHP_EOL, + date('c'), + $channelId + ) . PHP_EOL; + } + + continue; + } + } + + // Check item not registered yet + if (!$database->isChannelItemExist($channelId, $guid)) + { + // Create new one if not exists + $channelItemId = $database->addChannelItem( + $channelId, + $guid, + $link, + $title, + $description, + $content, + $pubTime + ); + + if ($channelItemId) + { + if ($channel->debug->info) + { + printf( + _('[%s] [info] registered new item #%d for channel #%d') . 
PHP_EOL, + date('c'), + $channelItemId, + $channelId + ) . PHP_EOL; + } + } + } + } } } \ No newline at end of file