init gemini protocol implementation

2026-03-31 17:55:35 +00:00 · 2024-04-03 17:07:28 +03:00 · 2024-04-03 17:07:28 +03:00 · 1f96ca8a2c
commit 1f96ca8a2c
parent 8418a56617
11 changed files with 204 additions and 2422 deletions
--- a/src/cli/document/crawl.php
+++ b/src/cli/document/crawl.php
@ -6,7 +6,7 @@ $microtime = microtime(true);
 // Load dependencies
 require_once __DIR__ . '/../../../vendor/autoload.php';

-// Define helpers
+// Define helpers @TODO move to a separate library (yo-php)
 function getLastSnapTime(array $files): int
 {
    $time = [];
@ -37,6 +37,40 @@ function getLastSnapTime(array $files): int
    return 0;
 }

+/**
+ * Resolve a link found in a document into an absolute URL.
+ *
+ * @param string      $source current document URL, used as the base
+ * @param string      $target relative or absolute link
+ * @param string|null $scheme receives the scheme of $source
+ * @param string|null $host   receives the host of $source
+ * @param int|null    $port   receives the port of $source (null when absent)
+ *
+ * @return string $target unchanged when it has a host, otherwise rebuilt on the base
+ */
+function relative2absolute(
+    string $source,
+    string $target,
+    ?string &$scheme = null,
+    ?string &$host = null,
+    ?int &$port = null
+) {
+    // Always fill the base parts from $source: callers compare link hosts against $host
+    $base   = parse_url($source);
+    $scheme = $base['scheme'] ?? null;
+    $host   = $base['host'] ?? null;
+    $port   = $base['port'] ?? null;
+
+    if (parse_url($target, PHP_URL_HOST))
+    {
+        return $target;
+    }
+
+    // Flatten the path: drop './' and '../' segments, leading slashes, surrounding dots
+    $path = trim(ltrim(str_replace(['./', '../'], '', $target), '/'), '.');
+
+    return $scheme . '://' . $host . ($port ? ':' . $port : '') . '/' . $path;
+}
+
 // Init config
 $config = json_decode(
    file_get_contents(
@ -182,16 +216,16 @@ foreach($index->search('')

    $data =
    [
-        'url'         => $document->get('url'),
-        'title'       => $document->get('title'),
-        'description' => $document->get('description'),
-        'keywords'    => $document->get('keywords'),
-        'code'        => $document->get('code'),
-        'size'        => $document->get('size'),
-        'mime'        => $document->get('mime'),
-        'rank'        => $document->get('rank'),
-        'time'        => $time,
-        'index'       => 0
+        'url'   => $document->get('url'),
+        'h1'    => $document->get('h1'),
+        'h2'    => $document->get('h2'),
+        'h3'    => $document->get('h3'),
+        'code'  => $document->get('code'),
+        'size'  => $document->get('size'),
+        'meta'  => $document->get('meta'),
+        'rank'  => $document->get('rank'),
+        'time'  => $time,
+        'index' => 0
    ];

    // Debug target
@ -205,114 +239,50 @@ foreach($index->search('')
        );
    }

-    // Update index time anyway and set reset code to 404
+    // Update index time anyway and set reset code to 51
    $index->updateDocument(
        [
            'time'  => time(),
-            'code'  => 200,
+            'code'  => 20,
            'index' => 0
        ],
        $document->getId()
    );

    // Request remote URL
-    $request = curl_init(
+    $request = new \Yggverse\Gemini\Client\Request(
        $document->get('url')
    );

-    // Drop URL with long response
-    curl_setopt(
-        $request,
-        CURLOPT_CONNECTTIMEOUT,
-        $config->cli->document->crawl->curl->connection->timeout
-    );
-
-    curl_setopt(
-        $request,
-        CURLOPT_TIMEOUT,
-        $config->cli->document->crawl->curl->connection->timeout
-    );
-
-    // Prevent huge content download e.g. media streams URL
-    curl_setopt(
-        $request,
-        CURLOPT_RETURNTRANSFER,
-        true
-    );
-
-    curl_setopt(
-        $request,
-        CURLOPT_NOPROGRESS,
-        false
-    );
-
-    curl_setopt(
-        $request,
-        CURLOPT_PROGRESSFUNCTION,
-        function(
-            $download,
-            $downloaded,
-            $upload,
-            $uploaded
-        ) {
-            global $config;
-
-            global $index;
-            global $document;
-
-            $index->updateDocument(
-                [
-                    'time'  => time(),
-                    'code'  => 200,
-                    'index' => 0
-                ],
-                $document->getId()
-            );
-
-            return $downloaded > $config->cli->document->crawl->curl->download->size->max ? 1 : 0;
-        }
+    $response = new \Yggverse\Gemini\Client\Response(
+        $request->getResponse(
+            $config->cli->document->crawl->connection->timeout,
+            $config->cli->document->crawl->connection->length,
+            $config->cli->document->crawl->connection->chunk,
+            $length
+        )
    );

    // Begin request
-    if ($response = curl_exec($request))
+    if ($code = $request->getCode()) // @TODO process redirects
    {
-        // Update HTTP code or skip on empty
-        if ($code = curl_getinfo($request, CURLINFO_HTTP_CODE))
-        {
-            // Delete deprecated document from index as HTTP code still not 200
-            /*
-            if ($code != 200 && !empty($data['code']) && $data['code'] != 200)
-            {
-                $index->deleteDocument(
-                    $document->getId()
-                );
-
-                continue;
-            }
-            */
-
-            $data['code'] = $code;
-
-        } else continue;
+        // Update status code
+        $data['code'] = $code;

        // Update size or skip on empty
-        if ($size = curl_getinfo($request, CURLINFO_SIZE_DOWNLOAD))
+        if ($length)
        {
-            $size = round( // float
-                $size
-            );
-
-            $data['size'] = $size;
+            $data['size'] = $length;

        } else continue;

-        // Update MIME type or skip on empty
-        if ($type = curl_getinfo($request, CURLINFO_CONTENT_TYPE))
+        // Update meta or skip on empty
+        if ($meta = $response->getMeta())
        {
-            $data['mime'] = $type;
+            $data['meta'] = $meta;

            // On document charset specified
-            if (preg_match('/charset=([^\s;]+)/i', $type, $charset))
+            if (preg_match('/charset=([^\s;]+)/i', $meta, $charset))
            {
                if (!empty($charset[1]))
                {
@ -322,10 +292,12 @@ foreach($index->search('')
                        if (strtolower($charset[1]) == strtolower($encoding))
                        {
                            // Convert response to UTF-8
-                            $response = mb_convert_encoding(
-                                $response,
-                                'UTF-8',
-                                $charset[1]
+                            $response->setBody(
+                                mb_convert_encoding(
+                                    $response->getBody(),
+                                    'UTF-8',
+                                    $charset[1]
+                                )
                            );

                            break;
@ -336,241 +308,102 @@ foreach($index->search('')

        } else continue;

-        // DOM crawler
-        if (
-            false !== stripos($type, 'text/html')
-            ||
-            false !== stripos($type, 'text/xhtml')
-            ||
-            false !== stripos($type, 'application/xhtml')
-        ) {
-            $crawler = new Symfony\Component\DomCrawler\Crawler();
-            $crawler->addHtmlContent(
-                $response
+        // Gemtext parser
+        if (false !== stripos($response->getMeta(), 'text/gemini'))
+        {
+            $body = new \Yggverse\Gemini\Client\Gemtext\Body(
+                $response->getBody()
            );

-            // Get title
-            foreach ($crawler->filter('head > title')->each(function($node) {
-
-                return $node->text();
-
-            }) as $value)
+            // Get H1
+            $h1 = [];
+            foreach ($body->getH1() as $value)
            {
-                if (!empty($value))
-                {
-                    $data['title'] = trim(
-                        strip_tags(
-                            html_entity_decode(
-                                $value
-                            )
-                        )
-                    );
-                }
+                $h1[] = $value;
            }

-            // Get description
-            foreach ($crawler->filter('head > meta[name="description"]')->each(function($node) {
-
-                return $node->attr('content');
-
-            }) as $value)
-            {
-                if (!empty($value))
-                {
-                    $data['description'] = trim(
-                        strip_tags(
-                            html_entity_decode(
-                                $value
-                            )
-                        )
-                    );
-                }
-            }
-
-            // Get keywords
-            $keywords = [];
-
-            // Extract from meta tag
-            foreach ($crawler->filter('head > meta[name="keywords"]')->each(function($node) {
-
-                return $node->attr('content');
-
-            }) as $value)
-            {
-                if (!empty($value))
-                {
-                    foreach ((array) explode(
-                        ',',
-                        mb_strtolower(
-                            strip_tags(
-                                html_entity_decode(
-                                    $value
-                                )
-                            )
-                        )
-                    ) as $keyword)
-                    {
-                        // Remove extra spaces
-                        $keyword = trim(
-                            $keyword
-                        );
-
-                        // Skip short words
-                        if (mb_strlen($keyword) > 2)
-                        {
-                            $keywords[] = $keyword;
-                        }
-                    }
-                }
-            }
-
-            // Get keywords from headers
-            /* Disable keywords collection from headers as body index enabled
-
-            foreach ($crawler->filter('h1,h2,h3,h4,h5,h6')->each(function($node) {
-
-                return $node->text();
-
-            }) as $value)
-            {
-                if (!empty($value))
-                {
-                    foreach ((array) explode(
-                        ',',
-                        mb_strtolower(
-                            strip_tags(
-                                html_entity_decode(
-                                    $value
-                                )
-                            )
-                        )
-                    ) as $keyword)
-                    {
-                        // Remove extra spaces
-                        $keyword = trim(
-                            $keyword
-                        );
-
-                        // Skip short words
-                        if (mb_strlen($keyword) > 2)
-                        {
-                            $keywords[] = $keyword;
-                        }
-                    }
-                }
-            }
-            */
-
-            // Keep keywords unique
-            $keywords = array_unique(
-                $keywords
+            $data['h1'] = implode(
+                ',',
+                array_unique(
+                    $h1
+                )
            );

-            // Update previous keywords when new value exists
-            if ($keywords)
+            // Get H2
+            $h2 = [];
+            foreach ($body->getH2() as $value)
            {
-                $data['keywords'] = implode(',', $keywords);
+                $h2[] = $value;
            }

+            $data['h2'] = implode(
+                ',',
+                array_unique(
+                    $h2
+                )
+            );
+
+            // Get H3
+            $h3 = [];
+            foreach ($body->getH3() as $value)
+            {
+                $h3[] = $value;
+            }
+
+            $data['h3'] = implode(
+                ',',
+                array_unique(
+                    $h3
+                )
+            );
+
            // Save document body text to index
-            foreach ($crawler->filter('html > body')->each(function($node) {
+            $data['body'] = trim(
+                preg_replace(
+                    '/[\s]{2,}/', // strip extra separators
+                    ' ',
+                    $response->getBody()
+                )
+            );

-                return $node->html();
-
-            }) as $value)
-            {
-                if (!empty($value))
-                {
-                    $data['body'] = trim(
-                        preg_replace(
-                            '/[\s]{2,}/', // strip extra separators
-                            ' ',
-                            strip_tags(
-                                str_replace( // make text separators before strip any closing tag, new line, etc
-                                    [
-                                        '<',
-                                        '>',
-                                        PHP_EOL,
-                                    ],
-                                    [
-                                        ' <',
-                                        '> ',
-                                        PHP_EOL . ' ',
-                                    ],
-                                    preg_replace(
-                                        [
-                                            '/<script([^>]*)>([\s\S]*?)<\/script>/i', // strip js content
-                                            '/<style([^>]*)>([\s\S]*?)<\/style>/i', // strip css content
-                                            '/<pre([^>]*)>([\s\S]*?)<\/pre>/i', // strip code content
-                                            '/<code([^>]*)>([\s\S]*?)<\/code>/i',
-                                        ],
-                                        '',
-                                        html_entity_decode(
-                                            $value
-                                        )
-                                    )
-                                )
-                            )
-                        )
-                    );
-                }
-            }
-
-            // Crawl documents
+            // Crawl links
            $documents = [];

-            $scheme = parse_url($document->get('url'), PHP_URL_SCHEME);
-            $host   = parse_url($document->get('url'), PHP_URL_HOST);
-            $port   = parse_url($document->get('url'), PHP_URL_PORT);
-
-            foreach ($config->cli->document->crawl->selector as $selector => $settings)
+            foreach ($body->getLinks() as $line)
            {
-                foreach ($crawler->filter($selector)->each(function($node) {
+                $link = new \Yggverse\Gemini\Gemtext\Link(
+                    $line
+                );

-                    return $node;
+                if ($url = $link->getAddress())
+                {
+                    //Make relative links absolute
+                    $url = relative2absolute(
+                        $document->get('url'),
+                        $url,
+                        $scheme,
+                        $host,
+                        $port,
+                    );

-                }) as $value) {
-
-                    if ($url = $value->attr($settings->attribute))
+                    // Regex rules
+                    if (!preg_match($config->cli->document->crawl->url->regex, $url))
                    {
-                        //Make relative links absolute
-                        if (!parse_url($url, PHP_URL_HOST))
-                        {
-                            $url =  $scheme . '://' . $host . ($port ? ':' . $port : null) .
-                                    '/' .
-                                    trim(
-                                        ltrim(
-                                            str_replace(
-                                                [
-                                                    './',
-                                                    '../'
-                                                ],
-                                                '',
-                                                $url
-                                            ),
-                                            '/'
-                                        ),
-                                        '.'
-                                    );
-                        }
-
-                        // Regex rules
-                        if (!preg_match($settings->regex, $url))
-                        {
-                            continue;
-                        }
-
-                        // External host rules
-                        if (!$settings->external && parse_url($url, PHP_URL_HOST) != $host)
-                        {
-                            continue;
-                        }
-
-                        $documents[] = $url;
+                        continue;
                    }
+
+                    // External host rules
+                    if (!$config->cli->document->crawl->url->external && parse_url($url, PHP_URL_HOST) != $host)
+                    {
+                        continue;
+                    }
+
+                    $documents[] = $url;
                }
            }

+            // @TODO find document links by protocol ($body->findLinks('gemini'))
+
            if ($documents)
            {
                foreach (array_unique($documents) as $url)
@ -578,7 +411,7 @@ foreach($index->search('')
                    // Apply stripos condition
                    $skip = false;

-                    foreach ($config->cli->document->crawl->skip->stripos->url as $condition)
+                    foreach ($config->cli->document->crawl->url->skip->stripos as $condition)
                    {
                        if (false !== stripos($url, $condition)) {

@ -597,7 +430,7 @@ foreach($index->search('')
                                date('c'),
                                $url,
                                print_r(
-                                    $config->cli->document->crawl->skip->stripos->url,
+                                    $config->cli->document->crawl->url->skip->stripos,
                                    true
                                )
                            );
@ -701,7 +534,7 @@ foreach($index->search('')
        }

        // Create snap
-        if ($config->cli->document->crawl->snap->enabled && $code === 200)
+        if ($config->cli->document->crawl->snap->enabled && $request->getCode() === 20)
        {
            try
            {
@ -734,12 +567,12 @@ foreach($index->search('')

                $snap->addFromString(
                    'DATA',
-                    $response
+                    $response->getBody()
                );

                $snap->addFromString(
-                    'MIME',
-                    $type
+                    'META',
+                    $response->getMeta()
                );

                $snap->addFromString(
@ -767,12 +600,12 @@ foreach($index->search('')
                // Copy to local storage on enabled
                if ($config->snap->storage->local->enabled)
                {
-                    // Check for mime allowed
+                    // Check for meta allowed
                    $allowed = false;

-                    foreach ($config->snap->storage->local->mime->stripos as $whitelist)
+                    foreach ($config->snap->storage->local->meta->stripos as $whitelist)
                    {
-                        if (false !== stripos($type, $whitelist))
+                        if (false !== stripos($response->getMeta(), $whitelist))
                        {
                            $allowed = true;
                            break;
@ -904,12 +737,12 @@ foreach($index->search('')
                        continue;
                    }

-                    // Check for mime allowed
+                    // Check for meta allowed
                    $allowed = false;

-                    foreach ($ftp->mime->stripos as $whitelist)
+                    foreach ($ftp->meta->stripos as $whitelist)
                    {
-                        if (false !== stripos($type, $whitelist))
+                        if (false !== stripos($response->getMeta(), $whitelist))
                        {
                            $allowed = true;
                            break;