Separate the `Pollable` and `Transactional` features; move the table members into their own module; use the single-connection transaction API in the crawler and llm crates; make minor crawler optimizations, such as disconnecting from the DB server once each queue iteration completes

This commit is contained in:
yggverse 2026-01-09 22:35:06 +02:00
parent 221b43e4cf
commit f48e256fad
11 changed files with 438 additions and 409 deletions

View file

@ -9,5 +9,10 @@ keywords = ["rssto", "database", "mysql", "library", "driver", "api"]
# categories = []
repository = "https://github.com/YGGverse/rssto"
[features]
default = ["pollable"]
pollable = []
transactional = []
[dependencies]
mysql = "26.0.1"
mysql = "26.0.1"

View file

@ -1,333 +1,13 @@
use mysql::{
Error, Pool,
prelude::{FromRow, Queryable},
};
#[cfg(feature = "pollable")]
pub mod pollable;
pub struct Mysql {
pool: Pool,
}
pub mod table;
impl Mysql {
pub fn connect(
host: &str,
port: u16,
user: &str,
password: &str,
database: &str,
) -> Result<Self, Error> {
Ok(Self {
pool: mysql::Pool::new(
format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str(),
)?,
})
}
#[cfg(feature = "transactional")]
pub mod transactional;
pub fn channels_by_url(&self, url: &str, limit: Option<usize>) -> Result<Vec<Channel>, Error> {
self.pool.get_conn()?.exec(
format!(
"SELECT `channel_id`, `url` FROM `channel` WHERE `url` = ? LIMIT {}",
limit.unwrap_or(DEFAULT_LIMIT)
),
(url,),
)
}
#[cfg(feature = "pollable")]
pub use pollable::Pollable;
pub fn insert_channel(&self, url: &str) -> Result<u64, Error> {
let mut c = self.pool.get_conn()?;
c.exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?;
Ok(c.last_insert_id())
}
pub fn channel_item(&self, channel_item_id: u64) -> Result<Option<ChannelItem>, Error> {
self.pool.get_conn()?.exec_first(
"SELECT `channel_item_id`,
`channel_id`,
`pub_date`,
`guid`,
`link`,
`title`,
`description` FROM `channel_item` WHERE `channel_item_id` = ?",
(channel_item_id,),
)
}
pub fn channel_items_by_channel_id_guid(
&self,
channel_id: u64,
guid: &str,
limit: Option<usize>,
) -> Result<Vec<ChannelItem>, Error> {
self.pool.get_conn()?.exec(
format!(
"SELECT `channel_item_id`,
`channel_id`,
`pub_date`,
`guid`,
`link`,
`title`,
`description` FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ? LIMIT {}",
limit.unwrap_or(DEFAULT_LIMIT)
),
(channel_id, guid),
)
}
pub fn insert_channel_item(
&self,
channel_id: u64,
pub_date: i64,
guid: &str,
link: &str,
title: Option<&str>,
description: Option<&str>,
) -> Result<u64, Error> {
let mut c = self.pool.get_conn()?;
c.exec_drop(
"INSERT INTO `channel_item` SET `channel_id` = ?,
`pub_date` = ?,
`guid` = ?,
`link` = ?,
`title` = ?,
`description` = ?",
(channel_id, pub_date, guid, link, title, description),
)?;
Ok(c.last_insert_id())
}
pub fn content(&self, content_id: u64) -> Result<Option<Content>, Error> {
self.pool.get_conn()?.exec_first(
"SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `content_id` = ?",
(content_id,),
)
}
pub fn contents_total_by_provider_id(
&self,
provider_id: Option<u64>,
keyword: Option<&str>,
) -> Result<usize, Error> {
let total: Option<usize> = self.pool.get_conn()?.exec_first(
"SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?",
(provider_id, like(keyword)),
)?;
Ok(total.unwrap_or(0))
}
pub fn contents_by_provider_id(
&self,
provider_id: Option<u64>,
keyword: Option<&str>,
sort: Sort,
limit: Option<usize>,
) -> Result<Vec<Content>, Error> {
self.pool.get_conn()?.exec(format!(
"SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {}",
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id, like(keyword), ))
}
/// Get subjects for `rssto-llm` queue
pub fn contents_queue_for_provider_id(
&self,
provider_id: u64,
sort: Sort,
limit: Option<usize>,
) -> Result<Vec<Content>, Error> {
self.pool.get_conn()?.exec(
format!(
"SELECT `c1`.`content_id`,
`c1`.`channel_item_id`,
`c1`.`provider_id`,
`c1`.`title`,
`c1`.`description`
FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS (
SELECT NULL FROM `content` AS `c2` WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id` AND `c2`.`provider_id` = ? LIMIT 1
) ORDER BY `c1`.`content_id` {sort} LIMIT {}",
limit.unwrap_or(DEFAULT_LIMIT)
),
(provider_id,),
)
}
pub fn contents_by_channel_item_id_provider_id(
&self,
channel_item_id: u64,
provider_id: Option<u64>,
limit: Option<usize>,
) -> Result<Vec<Content>, Error> {
self.pool.get_conn()?.exec(
format!(
"SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content`
WHERE `channel_item_id` = ? AND `provider_id` <=> ? LIMIT {}",
limit.unwrap_or(DEFAULT_LIMIT)
),
(channel_item_id, provider_id),
)
}
pub fn insert_content(
&self,
channel_item_id: u64,
provider_id: Option<u64>,
title: &str,
description: &str,
) -> Result<u64, Error> {
let mut c = self.pool.get_conn()?;
c.exec_drop(
"INSERT INTO `content` SET `channel_item_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
(channel_item_id, provider_id, title, description),
)?;
Ok(c.last_insert_id())
}
pub fn content_image(&self, content_image_id: u64) -> Result<Option<ContentImage>, Error> {
self.pool.get_conn()?.exec_first(
"SELECT `content_image_id`,
`content_id`,
`image_id`,
`data`,
`source` FROM `content_image`
JOIN `image` ON (`image`.`image_id` = `content_image`.`image_id`)
WHERE `content_image_id` = ? LIMIT 1",
(content_image_id,),
)
}
pub fn insert_content_image(&self, content_id: u64, image_id: u64) -> Result<u64, Error> {
let mut c = self.pool.get_conn()?;
c.exec_drop(
"INSERT INTO `content_image` SET `content_id` = ?, `image_id` = ?",
(content_id, image_id),
)?;
Ok(c.last_insert_id())
}
pub fn image_by_source(&self, source: &str) -> Result<Option<Image>, Error> {
self.pool.get_conn()?.exec_first(
"SELECT `image_id`,
`source`,
`data` FROM `image` WHERE `source` = ? LIMIT 1",
(source,),
)
}
pub fn images(&self, limit: Option<usize>) -> Result<Vec<Image>, Error> {
self.pool.get_conn()?.query(format!(
"SELECT `image_id`, `source`, `data` FROM `image` LIMIT {}",
limit.unwrap_or(DEFAULT_LIMIT)
))
}
pub fn insert_image(&self, source: &str, data: &[u8]) -> Result<u64, Error> {
let mut c = self.pool.get_conn()?;
c.exec_drop(
"INSERT INTO `image` SET `source` = ?, `data` = ?",
(source, data),
)?;
Ok(c.last_insert_id())
}
pub fn provider_by_name(&self, name: &str) -> Result<Option<Provider>, Error> {
self.pool.get_conn()?.exec_first(
"SELECT `provider_id`,
`name`
FROM `provider` WHERE `name` = ?",
(name,),
)
}
pub fn insert_provider(&self, name: &str) -> Result<u64, Error> {
let mut c = self.pool.get_conn()?;
c.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?;
Ok(c.last_insert_id())
}
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Channel {
pub channel_id: u64,
pub url: String,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItem {
pub channel_item_id: u64,
pub channel_id: u64,
pub pub_date: i64,
pub guid: String,
pub link: String,
pub title: Option<String>,
pub description: Option<String>,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Content {
pub content_id: u64,
pub channel_item_id: u64,
/// None if the original `title` and `description` values
/// parsed from the channel item on crawl
pub provider_id: Option<u64>,
pub title: String,
pub description: String,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Provider {
pub provider_id: u64,
pub name: String,
}
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Image {
pub image_id: u64,
pub source: String,
pub data: Vec<u8>,
}
/// Includes joined `image` table members
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ContentImage {
pub content_image_id: u64,
pub content_id: u64,
pub image_id: u64,
// Image members (JOIN)
pub data: Vec<u8>,
pub source: String,
}
pub enum Sort {
Asc,
Desc,
}
impl std::fmt::Display for Sort {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::Asc => write!(f, "ASC"),
Self::Desc => write!(f, "DESC"),
}
}
}
/// Shared search logic
fn like(value: Option<&str>) -> String {
value.map_or("%".into(), |k| format!("{k}%"))
}
const DEFAULT_LIMIT: usize = 100;
#[cfg(feature = "transactional")]
pub use transactional::Transactional;

View file

@ -0,0 +1,114 @@
pub mod sort;
pub use sort::Sort;
use crate::table::*;
use mysql::{Error, Pool, prelude::Queryable};
/// Pool-backed operations used in client apps like `rssto-http`
///
/// NOTE(review): described as read-only, yet the impl also exposes
/// `insert_provider`, which writes — confirm which one is intended.
pub struct Pollable {
// Connection pool; each method checks a connection out of it per call
pool: Pool,
}
/// Every method checks one connection out of the pool, runs a single
/// statement on it and returns the decoded result.
impl Pollable {
    /// Creates the connection pool for the given server and schema.
    ///
    /// The arguments are interpolated as-is into a `mysql://` URL.
    /// NOTE(review): nothing is escaped here, so values containing
    /// URL-reserved characters (e.g. `/` or `#` in the password) may not
    /// parse as intended — confirm how callers supply credentials.
    ///
    /// # Errors
    /// Returns the error reported by `mysql::Pool::new`.
    pub fn connect(
        host: &str,
        port: u16,
        user: &str,
        password: &str,
        database: &str,
    ) -> Result<Self, Error> {
        let url = format!("mysql://{user}:{password}@{host}:{port}/{database}");
        Ok(Self {
            pool: mysql::Pool::new(url.as_str())?,
        })
    }

    /// Fetches the `channel_item` row with the given `channel_item_id`.
    ///
    /// Returns `Ok(None)` when the statement yields no row.
    pub fn channel_item(&self, channel_item_id: u64) -> Result<Option<ChannelItem>, Error> {
        self.pool.get_conn()?.exec_first(
            "SELECT `channel_item_id`,
`channel_id`,
`pub_date`,
`guid`,
`link`,
`title`,
`description` FROM `channel_item` WHERE `channel_item_id` = ?",
            (channel_item_id,),
        )
    }

    /// Fetches the `content` row with the given `content_id`.
    ///
    /// Returns `Ok(None)` when the statement yields no row.
    pub fn content(&self, content_id: u64) -> Result<Option<Content>, Error> {
        self.pool.get_conn()?.exec_first(
            "SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `content_id` = ?",
            (content_id,),
        )
    }

    /// Counts `content` rows for a provider, optionally filtered by title prefix.
    ///
    /// * `provider_id` - compared with the NULL-safe `<=>` operator, so `None`
    ///   selects the rows whose `provider_id` is NULL
    /// * `keyword` - title prefix turned into a `LIKE` pattern by `like`;
    ///   `None` applies the bare `%` pattern
    ///
    /// Falls back to `0` when the statement returns no row.
    pub fn contents_total_by_provider_id(
        &self,
        provider_id: Option<u64>,
        keyword: Option<&str>,
    ) -> Result<usize, Error> {
        let total: Option<usize> = self.pool.get_conn()?.exec_first(
            "SELECT COUNT(*) FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ?",
            (provider_id, like(keyword)),
        )?;
        Ok(total.unwrap_or(0))
    }

    /// Lists `content` rows for a provider, ordered by `content_id`.
    ///
    /// * `provider_id` / `keyword` - same filters as
    ///   `contents_total_by_provider_id`
    /// * `sort` - direction of the `ORDER BY`
    /// * `limit` - maximum number of rows; `None` applies `DEFAULT_LIMIT`
    ///
    /// `sort` and `limit` are typed values (enum / `usize`), which is why they
    /// are formatted into the statement instead of being bound as parameters.
    pub fn contents_by_provider_id(
        &self,
        provider_id: Option<u64>,
        keyword: Option<&str>,
        sort: Sort,
        limit: Option<usize>,
    ) -> Result<Vec<Content>, Error> {
        self.pool.get_conn()?.exec(format!(
            "SELECT `content_id`,
`channel_item_id`,
`provider_id`,
`title`,
`description` FROM `content` WHERE `provider_id` <=> ? AND `title` LIKE ? ORDER BY `content_id` {sort} LIMIT {}",
            limit.unwrap_or(DEFAULT_LIMIT)
        ),
        (provider_id, like(keyword), ))
    }

    /// Fetches one `content_image` row together with the `data` and `source`
    /// of the `image` row it points to.
    ///
    /// Returns `Ok(None)` when the statement yields no row.
    pub fn content_image(&self, content_image_id: u64) -> Result<Option<ContentImage>, Error> {
        // `image_id` exists in both joined tables (see the ON clause), so an
        // unqualified reference is ambiguous to MySQL; every column is
        // therefore qualified with its table. Result column names are unchanged.
        self.pool.get_conn()?.exec_first(
            "SELECT `content_image`.`content_image_id`,
`content_image`.`content_id`,
`content_image`.`image_id`,
`image`.`data`,
`image`.`source` FROM `content_image`
JOIN `image` ON (`image`.`image_id` = `content_image`.`image_id`)
WHERE `content_image`.`content_image_id` = ? LIMIT 1",
            (content_image_id,),
        )
    }

    /// Lists rows of the `image` table; the statement has no `ORDER BY`.
    ///
    /// * `limit` - maximum number of rows; `None` applies `DEFAULT_LIMIT`
    pub fn images(&self, limit: Option<usize>) -> Result<Vec<Image>, Error> {
        self.pool.get_conn()?.query(format!(
            "SELECT `image_id`, `source`, `data` FROM `image` LIMIT {}",
            limit.unwrap_or(DEFAULT_LIMIT)
        ))
    }

    /// Inserts a `provider` row with the given `name` and returns the insert id
    /// reported by the connection.
    ///
    /// NOTE(review): this is a write on a type documented as read-only —
    /// confirm it belongs here.
    pub fn insert_provider(&self, name: &str) -> Result<u64, Error> {
        // The id must be read from the same connection that ran the INSERT.
        let mut conn = self.pool.get_conn()?;
        conn.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?;
        Ok(conn.last_insert_id())
    }
}
/// Shared search logic: builds the `LIKE` pattern for a prefix match.
///
/// A keyword becomes `<keyword>%`; no keyword becomes a lone `%`.
/// The keyword is inserted verbatim, so any `%` or `_` it contains is kept.
fn like(value: Option<&str>) -> String {
    match value {
        Some(keyword) => format!("{keyword}%"),
        None => String::from("%"),
    }
}
/// Row cap used by the list queries when the caller passes `None` as `limit`
const DEFAULT_LIMIT: usize = 100;

View file

@ -0,0 +1,13 @@
/// Ordering direction for list queries.
///
/// Its `Display` form is the SQL keyword (`ASC` / `DESC`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Sort {
    /// Ascending order
    Asc,
    /// Descending order
    Desc,
}
/// Renders the variant as the keyword placed after `ORDER BY <column>`.
impl std::fmt::Display for Sort {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let keyword = match self {
            Self::Asc => "ASC",
            Self::Desc => "DESC",
        };
        f.write_str(keyword)
    }
}

53
crates/mysql/src/table.rs Normal file
View file

@ -0,0 +1,53 @@
use mysql::prelude::FromRow;
/// Row holding the `channel_id` and `url` members of the `channel` table
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Channel {
// Identifier of the channel row
pub channel_id: u64,
// Channel URL (presumably the feed address — confirm against the crawler)
pub url: String,
}
/// Row of the `channel_item` table, one per item stored for a channel
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ChannelItem {
// Identifier of this item row
pub channel_item_id: u64,
// Identifier of the `channel` row the item belongs to
pub channel_id: u64,
// Publication date (presumably a Unix timestamp — TODO confirm unit)
pub pub_date: i64,
// Item GUID; looked up together with `channel_id` by the crawler-side queries
pub guid: String,
pub link: String,
// Nullable in the table, hence `Option`
pub title: Option<String>,
// Nullable in the table, hence `Option`
pub description: Option<String>,
}
/// Row of the `content` table: a `title` / `description` pair
/// attached to a channel item
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Content {
// Identifier of this content row
pub content_id: u64,
// Identifier of the `channel_item` row this content belongs to
pub channel_item_id: u64,
/// `None` when `title` and `description` are the original values
/// parsed from the channel item on crawl
pub provider_id: Option<u64>,
pub title: String,
pub description: String,
}
/// Row holding the `provider_id` and `name` members of the `provider` table
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Provider {
// Identifier referenced by `Content::provider_id`
pub provider_id: u64,
// Provider name; providers are looked up by this value
pub name: String,
}
/// Row of the `image` table
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct Image {
// Identifier of this image row
pub image_id: u64,
// Source the image was taken from (presumably its URL — confirm)
pub source: String,
// Raw image bytes as stored in the table
pub data: Vec<u8>,
}
/// Row of the `content_image` table (link between a content row and an image);
/// includes the joined `image` table members
#[derive(Debug, PartialEq, Eq, FromRow)]
pub struct ContentImage {
// Identifier of this link row
pub content_image_id: u64,
// Identifier of the linked `content` row
pub content_id: u64,
// Identifier of the linked `image` row
pub image_id: u64,
// Image members (taken from the `image` table by the JOIN)
pub data: Vec<u8>,
pub source: String,
}

View file

@ -0,0 +1,148 @@
use crate::table::*;
use mysql::{Error, Pool, Transaction, TxOpts, prelude::Queryable};
/// Safe, optimized read/write operations performed on one open transaction,
/// mostly required by `rssto-crawler` and `rssto-llm`
/// * changes made through any member require the `commit` action
pub struct Transactional {
// Transaction started in `connect`; every member runs its statement on it
tx: Transaction<'static>,
}
impl Transactional {
pub fn connect(
host: &str,
port: u16,
user: &str,
password: &str,
database: &str,
) -> Result<Self, Error> {
Ok(Self {
tx: Pool::new(format!("mysql://{user}:{password}@{host}:{port}/{database}").as_str())?
.start_transaction(TxOpts::default())?,
})
}
pub fn commit(self) -> Result<(), Error> {
self.tx.commit()
}
pub fn channel_id_by_url(&mut self, url: &str) -> Result<Option<u64>, Error> {
self.tx.exec_first(
"SELECT `channel_id` FROM `channel` WHERE `url` = ? LIMIT 1",
(url,),
)
}
pub fn insert_channel(&mut self, url: &str) -> Result<u64, Error> {
self.tx
.exec_drop("INSERT INTO `channel` SET `url` = ?", (url,))?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn channel_items_total_by_channel_id_guid(
&mut self,
channel_id: u64,
guid: &str,
) -> Result<usize, Error> {
Ok(self
.tx
.exec_first(
"SELECT COUNT(*) FROM `channel_item` WHERE `channel_id` = ? AND `guid` = ?",
(channel_id, guid),
)?
.unwrap_or(0))
}
pub fn insert_channel_item(
&mut self,
channel_id: u64,
pub_date: i64,
guid: &str,
link: &str,
title: Option<&str>,
description: Option<&str>,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `channel_item` SET `channel_id` = ?,
`pub_date` = ?,
`guid` = ?,
`link` = ?,
`title` = ?,
`description` = ?",
(channel_id, pub_date, guid, link, title, description),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn contents_queue_for_provider_id(
&mut self,
provider_id: u64,
) -> Result<Vec<Content>, Error> {
self.tx.exec(
"SELECT `c1`.`content_id`,
`c1`.`channel_item_id`,
`c1`.`provider_id`,
`c1`.`title`,
`c1`.`description`
FROM `content` AS `c1` WHERE `c1`.`provider_id` IS NULL AND NOT EXISTS (
SELECT NULL FROM `content` AS `c2`
WHERE `c2`.`channel_item_id` = `c1`.`channel_item_id`
AND `c2`.`provider_id` = ? LIMIT 1
)",
(provider_id,),
)
}
pub fn insert_content(
&mut self,
channel_item_id: u64,
provider_id: Option<u64>,
title: &str,
description: &str,
) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `content` SET `channel_item_id` = ?,
`provider_id` = ?,
`title` = ?,
`description` = ?",
(channel_item_id, provider_id, title, description),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn insert_content_image(&mut self, content_id: u64, image_id: u64) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `content_image` SET `content_id` = ?, `image_id` = ?",
(content_id, image_id),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn images_total_by_source(&mut self, source: &str) -> Result<usize, Error> {
Ok(self
.tx
.exec_first("SELECT COUNT(*) FROM `image` WHERE `source` = ?", (source,))?
.unwrap_or(0))
}
pub fn insert_image(&mut self, source: &str, data: &[u8]) -> Result<u64, Error> {
self.tx.exec_drop(
"INSERT INTO `image` SET `source` = ?, `data` = ?",
(source, data),
)?;
Ok(self.tx.last_insert_id().unwrap())
}
pub fn provider_id_by_name(&mut self, name: &str) -> Result<Option<u64>, Error> {
self.tx.exec_first(
"SELECT `provider_id` FROM `provider` WHERE `name` = ?",
(name,),
)
}
pub fn insert_provider(&mut self, name: &str) -> Result<u64, Error> {
self.tx
.exec_drop("INSERT INTO `provider` SET `name` = ?", (name,))?;
Ok(self.tx.last_insert_id().unwrap())
}
}