change index update logic

This commit is contained in:
yggverse 2025-02-12 04:43:59 +02:00
parent ffe1b47441
commit 96baf214b7
2 changed files with 64 additions and 115 deletions

View file

@ -2,7 +2,6 @@ mod argument;
mod output; mod output;
mod path; mod path;
use chrono::{DateTime, FixedOffset};
use output::Output; use output::Output;
use std::error::Error; use std::error::Error;
@ -16,25 +15,18 @@ fn main() -> Result<(), Box<dyn Error>> {
output.debug("crawler started"); output.debug("crawler started");
let mut status = None;
loop { loop {
crawl(&argument.source, &argument.target, &output, &mut status)?; crawl(&argument.source, &argument.target, &output)?;
sleep(Duration::from_secs(argument.update)); sleep(Duration::from_secs(argument.update));
} }
} }
fn crawl( fn crawl(source: &str, target: &str, output: &Output) -> Result<(), Box<dyn Error>> {
source: &str,
target: &str,
output: &Output,
status: &mut Option<DateTime<FixedOffset>>,
) -> Result<(), Box<dyn Error>> {
use path::Path; use path::Path;
use reqwest::blocking::get; use reqwest::blocking::get;
use rss::Channel; use rss::Channel;
use std::{ use std::{
fs::{metadata, File}, fs::{metadata, File, OpenOptions},
io::Write, io::Write,
}; };
use url::Url; use url::Url;
@ -44,115 +36,70 @@ fn crawl(
let mut total = 0; let mut total = 0;
let mut exist = 0; let mut exist = 0;
let channel = Channel::read_from(&get(source)?.bytes()?[..])?; // handle feed items
for item in Channel::read_from(&get(source)?.bytes()?[..])?
.items()
.iter()
{
total += 1;
match channel.pub_date() { let mut data = Vec::new();
Some(pub_date) => {
// update `index.gmi` on channel `pub_date` change
{
let remote_time = chrono::DateTime::parse_from_rfc2822(pub_date)?;
if status.is_none() || status.is_some_and(|local_time| local_time != remote_time) { let path = match item.pub_date() {
// update global state to skip `index.gmi` overwrites without changes Some(pub_date) => {
*status = Some(remote_time); let path = Path::build(target, pub_date, true)?;
if metadata(path.item()).is_ok() {
let index_path = Path::build(target, pub_date, true)?; exist += 1;
continue;
// build `index.gmi` members
let (mut file, mut data) = (File::create(index_path.index())?, Vec::new());
// collect `index.gmi` data
for item in channel.items().iter() {
match item.pub_date() {
Some(pub_date) => {
let item_path = Path::build(target, pub_date, true)?;
// skip not relevant records from `index.gmi`
if item_path.path != index_path.path {
continue;
}
data.push(format!("=> {} {pub_date}", item_path.item));
if let Some(description) = item.description() {
data.push(description.to_string());
}
if let Some(content) = item.content() {
data.push(content.to_string());
}
}
None => {
output.warning("item skipped as `pub_date` required by application")
}
}
}
// update `index.gmi` file with new version
file.write_all(data.join("\n\n").as_bytes())?;
output.debug("index file updated");
} }
data.push(format!("# {pub_date}"));
path
} }
None => {
// handle feed items output.warning("item skipped as `pub_date` required by application");
for item in channel.items().iter() { continue;
total += 1;
// handle item data
let mut data = Vec::new();
let path = match item.pub_date() {
Some(pub_date) => {
let path = Path::build(target, pub_date, true)?;
if metadata(path.item()).is_ok() {
exist += 1;
continue; // skip existing records
}
data.push(format!("# {pub_date}"));
path
}
None => {
output.warning("item skipped as `pub_date` required by application");
continue;
}
};
if let Some(description) = item.description() {
data.push(description.to_string());
}
if let Some(content) = item.content() {
data.push(content.to_string());
}
/* @TODO local storage
if let Some(enclosure) = item.enclosure() {
match enclosure.mime_type.as_str() {
"image/jpeg" => todo!(),
_ => todo!(),
}
} */
if let Some(link) = item.link() {
data.push(match Url::parse(link) {
Ok(url) => {
if let Some(host) = url.host_str() {
format!("=> {link} {host}")
} else {
format!("=> {link}")
}
}
Err(e) => {
output.warning(&e.to_string());
format!("=> {link}")
}
})
}
// record new item file
File::create(path.item())?.write_all(data.join("\n\n").as_bytes())?;
} }
};
let mut index = match OpenOptions::new().append(true).open(path.index()) {
Ok(index) => index,
Err(_) => {
let mut index = File::create_new(path.index())?;
index.write_all(format!("# {}\n", path.time.to_rfc2822()).as_bytes())?;
index
}
};
index.write_all(format!("\n=> {} {}\n", path.item, path.time).as_bytes())?;
if let Some(description) = item.description() {
index.write_all(format!("\n{description}\n").as_bytes())?;
data.push(description.to_string());
} }
None => output.warning("channel skipped as `pub_date` required by application"),
if let Some(content) = item.content() {
index.write_all(format!("\n{content}\n").as_bytes())?;
data.push(content.to_string());
}
if let Some(link) = item.link() {
data.push(match Url::parse(link) {
Ok(url) => {
if let Some(host) = url.host_str() {
format!("=> {link} {host}")
} else {
format!("=> {link}")
}
}
Err(e) => {
output.warning(&e.to_string());
format!("=> {link}")
}
})
}
File::create(path.item())?.write_all(data.join("\n\n").as_bytes())?;
} }
output.debug(&format!( output.debug(&format!(

View file

@ -1,9 +1,11 @@
use chrono::{DateTime, FixedOffset};
use std::error::Error; use std::error::Error;
use std::path::MAIN_SEPARATOR; use std::path::MAIN_SEPARATOR;
pub struct Path { pub struct Path {
pub item: String, pub item: String,
pub path: String, pub path: String,
pub time: DateTime<FixedOffset>,
} }
impl Path { impl Path {
@ -32,7 +34,7 @@ impl Path {
std::fs::create_dir_all(&path)?; std::fs::create_dir_all(&path)?;
} }
Ok(Path { item, path }) Ok(Path { item, path, time })
} }
// Getters // Getters