update parser logic

This commit is contained in:
yggverse 2026-03-20 04:48:08 +02:00
parent 666b57dffc
commit 0426d306bc
3 changed files with 22 additions and 430 deletions

389
Cargo.lock generated
View file

@ -2,19 +2,6 @@
# It is not intended for manual editing.
version = 4
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"const-random",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.4"
@ -24,12 +11,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "allocator-api2"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@ -95,27 +76,12 @@ version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "astral-tl"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d90933ffb0f97e2fc2e0de21da9d3f20597b804012d199843a6fe7c2810d28f3"
dependencies = [
"memchr",
]
[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bitflags"
version = "2.11.0"
@ -203,44 +169,12 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
[[package]]
name = "const-random"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359"
dependencies = [
"const-random-macro",
]
[[package]]
name = "const-random-macro"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
dependencies = [
"getrandom",
"once_cell",
"tiny-keccak",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "crunchy"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
[[package]]
name = "equivalent"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "fallible-iterator"
version = "0.3.0"
@ -253,12 +187,6 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
@ -272,10 +200,11 @@ dependencies = [
"anyhow",
"chrono",
"clap",
"html-to-markdown-rs",
"html-escape",
"log",
"regex",
"rusqlite",
"strip-tags",
"tracing-subscriber",
]
@ -285,25 +214,12 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
[[package]]
name = "getrandom"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
@ -331,35 +247,6 @@ dependencies = [
"utf8-width",
]
[[package]]
name = "html-to-markdown-rs"
version = "2.28.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9377e16af590b764fd98fd176027cf8831c5335f8964f3f643753e38913a4e"
dependencies = [
"ahash",
"astral-tl",
"base64",
"html-escape",
"html5ever",
"lru",
"once_cell",
"regex",
"serde",
"serde_json",
"thiserror",
]
[[package]]
name = "html5ever"
version = "0.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2"
dependencies = [
"log",
"markup5ever",
]
[[package]]
name = "iana-time-zone"
version = "0.1.65"
@ -390,12 +277,6 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
[[package]]
name = "itoa"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
[[package]]
name = "js-sys"
version = "0.3.91"
@ -428,41 +309,12 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "lock_api"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
dependencies = [
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "lru"
version = "0.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593"
dependencies = [
"hashbrown",
]
[[package]]
name = "markup5ever"
version = "0.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8983d30f2915feeaaab2d6babdd6bc7e9ed1a00b66b5e6d74df19aa9c0e91862"
dependencies = [
"log",
"tendril",
"web_atoms",
]
[[package]]
name = "matchers"
version = "0.2.0"
@ -478,12 +330,6 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "nu-ansi-term"
version = "0.50.3"
@ -514,68 +360,6 @@ version = "1.70.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "parking_lot"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-link",
]
[[package]]
name = "phf"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
dependencies = [
"phf_shared",
"serde",
]
[[package]]
name = "phf_codegen"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
dependencies = [
"fastrand",
"phf_shared",
]
[[package]]
name = "phf_shared"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
version = "0.2.17"
@ -588,12 +372,6 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro2"
version = "1.0.106"
@ -612,15 +390,6 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "redox_syscall"
version = "0.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
dependencies = [
"bitflags",
]
[[package]]
name = "regex"
version = "1.12.3"
@ -682,55 +451,6 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
dependencies = [
"itoa",
"memchr",
"serde",
"serde_core",
"zmij",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
@ -746,12 +466,6 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "siphasher"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e"
[[package]]
name = "smallvec"
version = "1.15.1"
@ -771,28 +485,10 @@ dependencies = [
]
[[package]]
name = "string_cache"
version = "0.9.0"
name = "strip-tags"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared",
"precomputed-hash",
]
[[package]]
name = "string_cache_codegen"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
checksum = "ecd2b127e68202f5f285a116f616d5d11735cca5e4befaea0347becd445b05b2"
[[package]]
name = "strsim"
@ -811,16 +507,6 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tendril"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24"
dependencies = [
"new_debug_unreachable",
"utf-8",
]
[[package]]
name = "thiserror"
version = "2.0.18"
@ -850,15 +536,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "tiny-keccak"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
dependencies = [
"crunchy",
]
[[package]]
name = "tracing"
version = "0.1.44"
@ -914,12 +591,6 @@ version = "1.0.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.8"
@ -944,18 +615,6 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]]
name = "wasm-bindgen"
version = "0.2.114"
@ -1001,18 +660,6 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "web_atoms"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57a9779e9f04d2ac1ce317aee707aa2f6b773afba7b931222bff6983843b1576"
dependencies = [
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
]
[[package]]
name = "windows-core"
version = "0.62.2"
@ -1080,29 +727,3 @@ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]
[[package]]
name = "zerocopy"
version = "0.8.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

View file

@ -13,8 +13,9 @@ repository = "https://github.com/YGGverse/flarumdown"
anyhow = "1.0.102"
chrono = "0.4.44"
clap = { version = "4.6.0", features = ["derive"] }
html-to-markdown-rs = "2.28.2"
html-escape = "0.2.13"
log = "0.4.29"
regex = "1.12.3"
rusqlite = { version = "0.39.0", features = ["chrono"]}
strip-tags = "0.1.0"
tracing-subscriber = { version = "0.3.23", features = ["env-filter"] }

View file

@ -6,7 +6,6 @@ use chrono::{DateTime, Local, Utc};
use clap::Parser;
use config::Config;
use database::Database;
use html_to_markdown_rs::convert;
use log::*;
use regex::{Captures, Regex};
use std::{
@ -174,10 +173,9 @@ fn main() -> Result<()> {
));
let mut uploads = HashSet::new();
content.push({
let mut post = post_format(&convert(
pre_format(&post.content, &mut uploads).trim(),
None,
)?);
let mut post = post_format(
pre_format(&post.content, &mut uploads).trim()
);
for d in &discussions {
post = post
.replace(
@ -240,7 +238,7 @@ fn main() -> Result<()> {
}
}
}
content.push("---\n".into())
content.push("\n---\n".into())
}
content.push(format!("Generated at {}\n", Utc::now()));
for refer in &config.refer {
@ -262,46 +260,18 @@ fn main() -> Result<()> {
}
fn pre_format(data: &str, uploads: &mut HashSet<PathBuf>) -> String {
Regex::new(r"<e>[^<]+</e>")
html_escape::decode_html_entities(&strip_tags::strip_tags(
&Regex::new(r#"(?s)<UPL-IMAGE-PREVIEW\s+alt="([^"]*)"\s+.*?url="([^"]*)"\s+[^>]*>[^<]*</UPL-IMAGE-PREVIEW>"#)
.unwrap()
.replace_all(
&Regex::new(r"<s>[^<]+</s>").unwrap().replace_all(
&Regex::new(r"(?s)<UPL-IMAGE-PREVIEW[^>]+>([^<]+)</UPL-IMAGE-PREVIEW>")
.unwrap()
.replace_all(data, |c: &Captures| {
uploads.insert(
Regex::new(r#"url="?([^\s]+)"?"#)
.unwrap()
.captures(&c[1])
.unwrap()[1]
.trim_start_matches("/")
.trim_start_matches("d/")
.into(),
);
format!(
"<img{}>",
c[1].replace(" url=d/", " url=")
.replace(" url=/", " url=")
.replace(" url=", " src=")
)
}),
"",
),
"",
)
.replace("<C", "<code")
.replace("</C>", "</code>")
.replace("<QUOTE", "<blockquote")
.replace("</QUOTE>", "</blockquote>")
.replace("<LIST", "<ul")
.replace("</LIST>", "</ul>")
.replace("<URL", "<a")
.replace("</URL>", "</a>")
.replace(" url=\"/", " href=\"")
.replace(" url=\"d/", " href=\"")
.replace(" url=", " href=")
.replace("<r>", "")
.replace("</r>", "")
.replace_all(data, |c: &Captures| {
let rel = c[2]
.trim_start_matches("/")
.trim_start_matches("d/");
uploads.insert(rel.into());
format!("![{}]({rel})", c.get(1).map(|s|s.as_str()).unwrap_or_default())
}),
))
.into()
}
fn post_format(data: &str) -> String {