From c0734731cb1b4500091d0cf1ba12f45ee68f2886 Mon Sep 17 00:00:00 2001 From: yggverse Date: Wed, 7 Jan 2026 19:02:00 +0200 Subject: [PATCH] disallow nullable title/description values for the content table, implement `contents_by_channel_item_id_source_id`, return last insert id for `insert_content`, fix `content_id` data type, implement initial content version save on crawl --- crates/crawler/src/main.rs | 52 ++++++++++++++++++++++++++++---------- crates/mysql/src/lib.rs | 26 ++++++++++++++++--- 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/crates/crawler/src/main.rs b/crates/crawler/src/main.rs index 7cdfc0b..fe86502 100644 --- a/crates/crawler/src/main.rs +++ b/crates/crawler/src/main.rs @@ -139,37 +139,61 @@ fn crawl(db: &mut Mysql, channel_config: &config::Channel) -> Result<()> { )?, }; - // @TODO preload remote content + // preload remote content let title = match channel_config.content_title_selector { Some(ref selector) => match scrape(&link, selector) { - Ok(value) => value, + Ok(value) => match value { + Some(title) => title, + None => { + warn!("Could not scrape `title` selector in `{channel_url}`"); + continue; + } + }, Err(e) => { warn!("Could not update `title` selector in `{channel_url}`: `{e}`"); continue; } }, - None => None, + None => match channel_item.title { + Some(ref title) => title.clone(), + None => { + warn!( + "Could not assign `title` from channel item for content in `{channel_url}`" + ); + continue; + } + }, }; - let description = match channel_config.content_description_selector { Some(ref selector) => match scrape(&link, selector) { - Ok(value) => value, + Ok(value) => match value { + Some(description) => description, + None => { + warn!("Could not scrape `description` selector in `{channel_url}`"); + continue; + } + }, Err(e) => { warn!("Could not update `description` selector in `{channel_url}`: `{e}`"); continue; } }, - None => None, + None => match channel_item.description { + Some(ref description) => description.clone(), + None => { + warn!( + "Could not assign `description` from channel item for content in `{channel_url}`" + ); + continue; + } + }, }; - - if title.is_none() && description.is_none() { - continue; - } - - // @TODO insert content record - - println!("{:?}", description) + assert!( + db.contents_by_channel_item_id_source_id(channel_item_id, None, Some(1))? + .is_empty() + ); + let _content_id = db.insert_content(channel_item_id, None, title, description)?; } Ok(()) } diff --git a/crates/mysql/src/lib.rs b/crates/mysql/src/lib.rs index 92ffb27..4219d50 100644 --- a/crates/mysql/src/lib.rs +++ b/crates/mysql/src/lib.rs @@ -93,17 +93,34 @@ impl Mysql { Ok(self.connection.last_insert_id()) } + pub fn contents_by_channel_item_id_source_id( + &mut self, + channel_item_id: u64, + source_id: Option, + limit: Option, + ) -> Result, Error> { + self.connection.exec_map( + format!( + "SELECT `content_id`, `channel_item_id`, `source_id`, `title`, `description` FROM `content` WHERE `channel_item_id` = ? AND `source_id` = ? LIMIT {}", + limit.unwrap_or(DEFAULT_LIMIT) + ), + (channel_item_id, source_id), + |(content_id, channel_item_id,source_id, title, description)| Content { content_id, channel_item_id, source_id, title, description }, + ) + } + pub fn insert_content( &mut self, channel_item_id: u64, source_id: Option, - title: &str, - description: &str, - ) -> Result<(), Error> { + title: String, + description: String, + ) -> Result { self.connection.exec_drop( "INSERT INTO `content` SET `channel_item_id` = ?, `source_id` = ?, `title` = ?, `description` = ?", (channel_item_id, source_id, title, description ), - ) + )?; + Ok(self.connection.last_insert_id()) } } @@ -126,6 +143,7 @@ pub struct ChannelItem { #[derive(Debug, PartialEq, Eq)] pub struct Content { + pub content_id: u64, pub channel_item_id: u64, /// None if the original `title` and `description` values /// parsed from the channel item on crawl