From 0426d306bc4dd3e8b8417abfc25f2f93e28cad50 Mon Sep 17 00:00:00 2001 From: yggverse Date: Fri, 20 Mar 2026 04:48:08 +0200 Subject: [PATCH] update parser logic --- Cargo.lock | 389 +--------------------------------------------------- Cargo.toml | 3 +- src/main.rs | 60 ++------ 3 files changed, 22 insertions(+), 430 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1ac0c64..2c5ca4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,19 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "const-random", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.4" @@ -24,12 +11,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -95,27 +76,12 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" -[[package]] -name = "astral-tl" -version = "0.7.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d90933ffb0f97e2fc2e0de21da9d3f20597b804012d199843a6fe7c2810d28f3" -dependencies = [ - "memchr", -] - [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - [[package]] name = "bitflags" version = "2.11.0" @@ -203,44 +169,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" -[[package]] -name = "const-random" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" -dependencies = [ - "const-random-macro", -] - -[[package]] -name = "const-random-macro" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" -dependencies = [ - "getrandom", - "once_cell", - "tiny-keccak", -] - [[package]] name = "core-foundation-sys" version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "crunchy" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" - -[[package]] -name = "equivalent" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - [[package]] name = "fallible-iterator" version = "0.3.0" @@ -253,12 +187,6 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" -[[package]] -name = "fastrand" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" - [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -272,10 +200,11 @@ dependencies = [ "anyhow", "chrono", "clap", - "html-to-markdown-rs", + "html-escape", "log", "regex", "rusqlite", + "strip-tags", "tracing-subscriber", ] @@ -285,25 +214,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" -[[package]] -name = "getrandom" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - [[package]] name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ - "allocator-api2", - "equivalent", "foldhash", ] @@ -331,35 +247,6 @@ dependencies = [ "utf8-width", ] -[[package]] -name = "html-to-markdown-rs" -version = "2.28.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9377e16af590b764fd98fd176027cf8831c5335f8964f3f643753e38913a4e" -dependencies = [ - "ahash", - "astral-tl", - "base64", - "html-escape", - "html5ever", - "lru", - "once_cell", - "regex", - "serde", - "serde_json", - "thiserror", -] - -[[package]] -name = "html5ever" -version = "0.38.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2" -dependencies = [ - "log", - "markup5ever", -] - [[package]] name = "iana-time-zone" version = "0.1.65" @@ -390,12 +277,6 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" -[[package]] -name = "itoa" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" - [[package]] name = "js-sys" version = "0.3.91" @@ -428,41 +309,12 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - [[package]] name = "log" version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" -[[package]] -name = "lru" -version = "0.16.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" -dependencies = [ - "hashbrown", -] - -[[package]] -name = "markup5ever" -version = "0.38.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8983d30f2915feeaaab2d6babdd6bc7e9ed1a00b66b5e6d74df19aa9c0e91862" -dependencies = [ - "log", - "tendril", - "web_atoms", -] - [[package]] name = "matchers" version = "0.2.0" @@ -478,12 +330,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" -[[package]] -name = "new_debug_unreachable" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" - [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -514,68 +360,6 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" -[[package]] -name = "parking_lot" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - -[[package]] -name = "phf" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" -dependencies = [ - "phf_shared", - "serde", -] - -[[package]] -name = "phf_codegen" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" -dependencies = [ - "fastrand", - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" -dependencies = [ - "siphasher", -] - [[package]] name = "pin-project-lite" version = "0.2.17" @@ -588,12 +372,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "precomputed-hash" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" - [[package]] name = "proc-macro2" version = "1.0.106" @@ -612,15 +390,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "redox_syscall" -version = "0.5.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" -dependencies = [ - "bitflags", -] - [[package]] name = "regex" version = "1.12.3" @@ -682,55 +451,6 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "serde" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" -dependencies = [ - "serde_core", - "serde_derive", -] - -[[package]] -name = "serde_core" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.228" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.149" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" -dependencies = [ - "itoa", - "memchr", - "serde", - "serde_core", - "zmij", -] - [[package]] name = "sharded-slab" version = "0.1.7" @@ -746,12 +466,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" -[[package]] -name = "siphasher" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" - [[package]] name = "smallvec" version = "1.15.1" @@ -771,28 +485,10 @@ dependencies = [ ] [[package]] -name = "string_cache" -version = "0.9.0" +name = "strip-tags" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901" -dependencies = [ - "new_debug_unreachable", - "parking_lot", - "phf_shared", - "precomputed-hash", -] - -[[package]] -name = "string_cache_codegen" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69" -dependencies = [ - "phf_generator", - "phf_shared", - "proc-macro2", - "quote", -] +checksum = "ecd2b127e68202f5f285a116f616d5d11735cca5e4befaea0347becd445b05b2" [[package]] name = "strsim" @@ -811,16 +507,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "tendril" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24" -dependencies = [ - "new_debug_unreachable", - "utf-8", -] - [[package]] name = "thiserror" version = "2.0.18" @@ -850,15 +536,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "tiny-keccak" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - "crunchy", -] - [[package]] name = "tracing" version = "0.1.44" @@ -914,12 +591,6 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - [[package]] name = "utf8-width" version = "0.1.8" @@ -944,18 +615,6 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - -[[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" - [[package]] name = "wasm-bindgen" version = "0.2.114" @@ -1001,18 +660,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "web_atoms" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a9779e9f04d2ac1ce317aee707aa2f6b773afba7b931222bff6983843b1576" -dependencies = [ - "phf", - "phf_codegen", - "string_cache", - "string_cache_codegen", -] - [[package]] name = "windows-core" version = "0.62.2" @@ -1080,29 +727,3 @@ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ "windows-link", ] - -[[package]] -name = "zerocopy" -version = "0.8.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "zmij" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index 5787cf6..2325925 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,8 +13,9 @@ repository = "https://github.com/YGGverse/flarumdown" anyhow = "1.0.102" chrono = "0.4.44" clap = { version = "4.6.0", features = ["derive"] } -html-to-markdown-rs = "2.28.2" +html-escape = "0.2.13" log = "0.4.29" regex = "1.12.3" rusqlite = { version = "0.39.0", features = ["chrono"]} +strip-tags = "0.1.0" tracing-subscriber = { version = "0.3.23", features = ["env-filter"] } diff --git a/src/main.rs b/src/main.rs index 39c6130..f30151c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,6 @@ use chrono::{DateTime, Local, Utc}; use clap::Parser; use config::Config; use database::Database; -use html_to_markdown_rs::convert; use log::*; use regex::{Captures, Regex}; use std::{ @@ -174,10 +173,9 @@ fn main() -> Result<()> { )); let mut uploads = HashSet::new(); content.push({ - let mut post = post_format(&convert( - pre_format(&post.content, &mut uploads).trim(), - None, - )?); + let mut post = post_format( + pre_format(&post.content, &mut uploads).trim() + ); for d in &discussions { post = post .replace( @@ -240,7 +238,7 @@ fn main() -> Result<()> { } } } - content.push("---\n".into()) + content.push("\n---\n".into()) } content.push(format!("Generated at {}\n", Utc::now())); for refer in &config.refer { @@ -262,46 +260,18 @@ fn main() -> Result<()> { } fn pre_format(data: &str, uploads: &mut HashSet) -> String { - Regex::new(r"[^<]+") + html_escape::decode_html_entities(&strip_tags::strip_tags( + &Regex::new(r#"(?s)]*>[^<]*"#) .unwrap() - .replace_all( - &Regex::new(r"[^<]+").unwrap().replace_all( - &Regex::new(r"(?s)]+>([^<]+)") - .unwrap() - .replace_all(data, |c: &Captures| { - uploads.insert( - Regex::new(r#"url="?([^\s]+)"?"#) - .unwrap() - .captures(&c[1]) - .unwrap()[1] - .trim_start_matches("/") - .trim_start_matches("d/") - .into(), - ); - format!( - "", - c[1].replace(" url=d/", " url=") - .replace(" url=/", " url=") - .replace(" url=", " src=") - ) - }), - "", - ), - "", - ) - .replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") - .replace(" url=\"/", " href=\"") - .replace(" url=\"d/", " href=\"") - .replace(" url=", " href=") - .replace("", "") - .replace("", "") + .replace_all(data, |c: &Captures| { + let rel = c[2] + .trim_start_matches("/") + .trim_start_matches("d/"); + uploads.insert(rel.into()); + format!("![{}]({rel})", c.get(1).map(|s|s.as_str()).unwrap_or_default()) + }), + )) + .into() } fn post_format(data: &str) -> String {