feat: remove dissolve + add simpler html tag stripper + html entities

This commit is contained in:
VC
2023-11-16 15:42:10 +01:00
parent 1fdea7f69d
commit 6fccbf8d16
4 changed files with 117 additions and 411 deletions

View File

@@ -1,15 +1,15 @@
use dissolve::strip_html_tags;
use html_escape::decode_html_entities;
use megalodon::entities::status::Tag;
use regex::Regex;
use std::error::Error;
pub fn strip_everything(content: &str, tags: &Vec<Tag>) -> Result<String, Box<dyn Error>> {
let mut res =
strip_html_tags(&content.replace("</p><p>", "\n\n").replace("<br />", "\n")).join("");
let mut res = strip_html_tags(&content.replace("</p><p>", "\n\n").replace("<br />", "\n"));
strip_mastodon_tags(&mut res, tags).unwrap();
res = res.trim_end_matches('\n').trim_end_matches(' ').to_string();
res = decode_html_entities(&res).to_string();
Ok(res)
}
@@ -22,3 +22,64 @@ fn strip_mastodon_tags(content: &mut String, tags: &Vec<Tag>) -> Result<(), Box<
Ok(())
}
fn strip_html_tags(input: &str) -> String {
let mut data = String::new();
let mut inside = false;
for c in input.chars() {
if c == '<' {
inside = true;
continue;
}
if c == '>' {
inside = false;
continue;
}
if !inside {
data.push(c);
}
}
data
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_strip_mastodon_tags() {
let tags = vec![
Tag {
name: "putaclic".to_string(),
url: "https://m.nintendojo.fr/tags/putaclic".to_string(),
},
Tag {
name: "tamerelol".to_string(),
url: "https://m.nintendojo.fr/tags/tamerelol".to_string(),
},
Tag {
name: "JeFaisNawakEnCamelCase".to_string(),
url: "https://m.nintendojo.fr/tags/jefaisnawakencamelcase".to_string(),
},
];
let mut content =
"Cest super ça! #putaclic #TAMERELOL #JeFaisNawakEnCamelCase".to_string();
let sample = "Cest super ça! ".to_string();
strip_mastodon_tags(&mut content, &tags).unwrap();
assert_eq!(content, sample);
}
#[test]
fn test_strip_everything() {
let content = "<p>Ce soir à 21h, c&#39;est le Dojobar ! Au programme ce soir, une rétrospective sur la série Mario &amp; Luigi.<br />Comme d&#39;hab, le Twitch sera ici : <a href=\"https://twitch.tv/nintendojofr\" target=\"_blank\" rel=\"nofollow noopener noreferrer\" translate=\"no\"><span class=\"invisible\">https://</span><span class=\"\">twitch.tv/nintendojofr</span><span class=\"invisible\"></span></a><br />Ou juste l&#39;audio là : <a href=\"https://nintendojo.fr/dojobar\" target=\"_blank\" rel=\"nofollow noopener noreferrer\" translate=\"no\"><span class=\"invisible\">https://</span><span class=\"\">nintendojo.fr/dojobar</span><span class=\"invisible\"></span></a><br />A toute !</p>";
let expected_result = "Ce soir à 21h, c'est le Dojobar ! Au programme ce soir, une rétrospective sur la série Mario & Luigi.\nComme d'hab, le Twitch sera ici : https://twitch.tv/nintendojofr\nOu juste l'audio là : https://nintendojo.fr/dojobar\nA toute !".to_string();
let result = strip_everything(content, &vec![]).unwrap();
assert_eq!(result, expected_result);
}
}