8 Commits

Author  SHA1        Message                                                                 Date
VC      b1aed34f3c  Merge branch '3-cut-toot-in-half-when-they-re-too-big' into 'main'      2023-11-20 14:53:19 +00:00
                    Cut toot in half
                    Closes #3
                    See merge request veretcle/oolatoocs!5
VC      e8bde4c779  feat: move media generation list to twitter.rs to avoid clutter         2023-11-20 15:32:02 +01:00
VC      80946ac131  chore: cargo update                                                     2023-11-20 15:32:02 +01:00
VC      87b0567b59  feat: split toot into 2 tweets when necessary                           2023-11-20 15:32:02 +01:00
VC      b6f87e829f  Merge branch '2-find-a-way-to-remove-dissolve' into 'main'              2023-11-17 19:30:36 +00:00
                    feat: remove dissolve + add simpler html tag stripper + html entities
                    Closes #2
                    See merge request veretcle/oolatoocs!4
VC      6fccbf8d16  feat: remove dissolve + add simpler html tag stripper + html entities   2023-11-17 20:08:07 +01:00
VC      1fdea7f69d  Merge branch 'parallel_medias' into 'main'                              2023-11-16 08:34:56 +00:00
                    feat: async upload of medias
                    See merge request veretcle/oolatoocs!3
VC      b73d6340c9  feat: async upload of medias                                            2023-11-15 15:20:03 +01:00
7 changed files with 405 additions and 584 deletions

Cargo.lock (generated), 703 changed lines: file diff suppressed because it is too large.

@@ -1,19 +1,20 @@
 [package]
 name = "oolatoocs"
-version = "1.0.0"
+version = "1.3.1"
 edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 [dependencies]
 clap = "^4"
-dissolve = "0.2.2"
 env_logger = "^0.10"
+futures = "^0.3"
+html-escape = "^0.2"
 log = "^0.4"
 megalodon = "^0.11"
 oauth1-request = "^0.6"
-regex = "1.10.2"
-reqwest = { version = "0.11.22", features = ["json", "stream", "multipart"] }
+regex = "^1.10"
+reqwest = { version = "^0.11", features = ["json", "stream", "multipart"] }
 rusqlite = "^0.27"
 serde = { version = "^1.0", features = ["derive"] }
 tokio = { version = "^1.33", features = ["rt-multi-thread", "macros", "time"] }


@@ -8,7 +8,7 @@ pub struct Config {
     pub twitter: TwitterConfig,
 }
-#[derive(Debug, Deserialize)]
+#[derive(Debug, Deserialize, Clone)]
 pub struct TwitterConfig {
     pub consumer_key: String,
     pub consumer_secret: String,


@@ -14,13 +14,12 @@ use mastodon::get_mastodon_timeline_since;
 pub use mastodon::register;
 mod utils;
-use utils::strip_everything;
+use utils::{generate_multi_tweets, strip_everything};
 mod twitter;
-#[allow(unused_imports)]
-use twitter::{post_tweet, upload_chunk_media, upload_simple_media};
+use twitter::{generate_media_ids, post_tweet};
 use megalodon::entities::attachment::AttachmentType;
 use rusqlite::Connection;
 #[tokio::main]
@@ -37,55 +36,30 @@ pub async fn run(config: &Config) {
         .unwrap_or_else(|e| panic!("Cannot get instance: {}", e));
     for toot in timeline {
-        let Ok(tweet_content) = strip_everything(&toot.content, &toot.tags) else {
+        let Ok(mut tweet_content) = strip_everything(&toot.content, &toot.tags) else {
            continue; // skip in case we cant strip something
        };
-        let mut medias: Vec<u64> = vec![];
-        // if we wanted to cut toot in half, now would be the right time to do so
-        // treats media
-        for media in toot.media_attachments {
-            let id = match media.r#type {
-                AttachmentType::Image => {
-                    let Ok(id) =
-                        upload_simple_media(&config.twitter, &media.url, &media.description).await
-                    else {
-                        continue;
-                    };
-                    id
-                }
-                AttachmentType::Gifv => {
-                    let Ok(id) = upload_chunk_media(&config.twitter, &media.url, "tweet_gif").await
-                    else {
-                        continue;
-                    };
-                    id
-                }
-                AttachmentType::Video => {
-                    let Ok(id) =
-                        upload_chunk_media(&config.twitter, &media.url, "tweet_video").await
-                    else {
-                        continue;
-                    };
-                    id
-                }
-                _ => {
-                    continue;
-                }
-            };
-            medias.push(id);
-        }
         // threads if necessary
-        let reply_to = toot.in_reply_to_id.and_then(|t| {
+        let mut reply_to = toot.in_reply_to_id.and_then(|t| {
            read_state(&conn, Some(t.parse::<u64>().unwrap()))
                .ok()
                .flatten()
                .map(|s| s.tweet_id)
        });
+        // if the toot is too long, we cut it in half here
+        if let Some((first_half, second_half)) = generate_multi_tweets(&tweet_content) {
+            tweet_content = second_half;
+            let reply_id = post_tweet(&config.twitter, &first_half, &[], &reply_to)
+                .await
+                .unwrap_or_else(|e| panic!("Cannot post the first half of {}: {}", &toot.id, e));
+            reply_to = Some(reply_id);
+        };
+        // treats medias
+        let medias = generate_media_ids(&config.twitter, &toot.media_attachments).await;
         // posts corresponding tweet
         let tweet_id = post_tweet(&config.twitter, &tweet_content, &medias, &reply_to)
            .await
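
The new control flow above can be read in isolation: when a toot is longer than Twitter allows, the first half is posted on its own (without media) and its tweet id becomes the new reply target, so the second half and the media land as a reply underneath it. A minimal standalone sketch of that logic, with generate_multi_tweets and post_tweet replaced by illustrative stand-ins (split_in_half, post_stub and the fake id 42 are not part of oolatoocs):

// Standalone sketch of the split-and-thread flow; the helpers are stand-ins.
fn split_in_half(text: &str) -> Option<(String, String)> {
    // stand-in for generate_multi_tweets: split anything longer than 280 chars
    if text.chars().count() <= 280 {
        return None;
    }
    let words: Vec<&str> = text.split(' ').collect();
    let (a, b) = words.split_at(words.len() / 2);
    Some((a.join(" "), b.join(" ")))
}

fn post_stub(text: &str, reply_to: &Option<u64>) -> u64 {
    // stand-in for post_tweet: pretend to post and return a fake tweet id
    println!("posting {} chars in reply to {:?}", text.chars().count(), reply_to);
    42
}

fn main() {
    let mut content = "word ".repeat(100).trim_end().to_string(); // ~500 chars
    let mut reply_to: Option<u64> = None;

    // too long: post the first half alone and thread everything else under it
    if let Some((first_half, second_half)) = split_in_half(&content) {
        content = second_half;
        reply_to = Some(post_stub(&first_half, &reply_to));
    }

    // the remaining half (plus media ids, in the real code) goes out as a reply
    post_stub(&content, &reply_to);
}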


@@ -34,7 +34,6 @@ pub async fn get_mastodon_timeline_since(
        .await?
        .json()
        .iter()
-        .cloned()
        .filter(|t| {
            // this excludes the reply to other users
            t.in_reply_to_account_id.is_none()
@@ -45,6 +44,7 @@ pub async fn get_mastodon_timeline_since(
        .filter(|t| t.visibility == StatusVisibility::Public) // excludes everything that isnt
        // public
        .filter(|t| t.reblog.is_none()) // excludes reblogs
+        .cloned()
        .collect();
    timeline.reverse();
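
A note on the one-line move above: with .cloned() placed after the filters, the predicates run on references, so only the statuses that survive filtering are actually cloned, which is presumably the point of the change. A tiny sketch of the pattern (Status here is a stand-in struct, not megalodon's entity):

// Filter by reference first, clone last: rejected items are never copied.
#[derive(Clone)]
struct Status {
    public: bool,
}

fn main() {
    let fetched = vec![Status { public: true }, Status { public: false }];

    let timeline: Vec<Status> = fetched
        .iter()               // iterate over &Status
        .filter(|s| s.public) // decide on references
        .cloned()             // clone only what is kept
        .collect();

    assert_eq!(timeline.len(), 1);
}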


@@ -1,6 +1,8 @@
 use crate::config::TwitterConfig;
 use crate::error::OolatoocsError;
-use log::debug;
+use futures::{stream, StreamExt};
+use log::{debug, error, warn};
+use megalodon::entities::attachment::{Attachment, AttachmentType};
 use oauth1_request::Token;
 use reqwest::{
     multipart::{Form, Part},
@@ -99,12 +101,53 @@ fn get_token(config: &TwitterConfig) -> Token {
     )
 }
+pub async fn generate_media_ids(config: &TwitterConfig, media_attach: &[Attachment]) -> Vec<u64> {
+    let mut medias: Vec<u64> = vec![];
+    let media_attachments = media_attach.to_owned();
+    let mut stream = stream::iter(media_attachments)
+        .map(|media| {
+            let twitter_config = config.clone();
+            tokio::task::spawn(async move {
+                match media.r#type {
+                    AttachmentType::Image => {
+                        upload_simple_media(&twitter_config, &media.url, &media.description).await
+                    }
+                    AttachmentType::Gifv => {
+                        upload_chunk_media(&twitter_config, &media.url, "tweet_gif").await
+                    }
+                    AttachmentType::Video => {
+                        upload_chunk_media(&twitter_config, &media.url, "tweet_video").await
+                    }
+                    _ => Err::<u64, Box<dyn Error + Send + Sync>>(
+                        OolatoocsError::new(&format!(
+                            "Cannot treat this type of media: {}",
+                            &media.url
+                        ))
+                        .into(),
+                    ),
+                }
+            })
+        })
+        .buffered(4);
+    while let Some(result) = stream.next().await {
+        match result {
+            Ok(Ok(v)) => medias.push(v),
+            Ok(Err(e)) => warn!("Cannot treat media: {}", e),
+            Err(e) => error!("Something went wrong when joining the main thread: {}", e),
+        }
+    }
+    medias
+}
 /// This function uploads simple images from Mastodon to Twitter and returns the media id from Twitter
-pub async fn upload_simple_media(
+async fn upload_simple_media(
     config: &TwitterConfig,
     u: &str,
     d: &Option<String>,
-) -> Result<u64, Box<dyn Error>> {
+) -> Result<u64, Box<dyn Error + Send + Sync>> {
     // initiate request parameters
     let empty_request = EmptyRequest {}; // Why? Because fuck you, thats why!
     let token = get_token(config);
@@ -152,7 +195,11 @@ pub async fn upload_simple_media(
 }
 /// This function updates the metadata given the current media_id and token
-async fn metadata_create(config: &TwitterConfig, id: u64, m: &str) -> Result<(), Box<dyn Error>> {
+async fn metadata_create(
+    config: &TwitterConfig,
+    id: u64,
+    m: &str,
+) -> Result<(), Box<dyn Error + Send + Sync>> {
     let token = get_token(config);
     let empty_request = EmptyRequest {};
@@ -187,11 +234,11 @@ async fn metadata_create(config: &TwitterConfig, id: u64, m: &str) -> Result<(),
 }
 /// This posts video/gif to Twitter and returns the media id from Twitter
-pub async fn upload_chunk_media(
+async fn upload_chunk_media(
     config: &TwitterConfig,
     u: &str,
     t: &str,
-) -> Result<u64, Box<dyn Error>> {
+) -> Result<u64, Box<dyn Error + Send + Sync>> {
     let empty_request = EmptyRequest {};
     let token = get_token(config);
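
The new generate_media_ids fans the uploads out as spawned Tokio tasks driven through a buffered stream: each task gets its own clone of the Twitter config (which is why TwitterConfig now derives Clone), at most four uploads run at once, and results come back in the original attachment order. A self-contained sketch of that pattern, using only the futures and tokio crates already in Cargo.toml; fake_upload and the numeric ids are illustrative stand-ins for the real upload calls:

// Sketch of the buffered-upload pattern; fake_upload stands in for the real uploads.
use futures::{stream, StreamExt};

async fn fake_upload(n: u64) -> Result<u64, String> {
    // stand-in for upload_simple_media / upload_chunk_media
    Ok(n * 10)
}

#[tokio::main]
async fn main() {
    let attachments: Vec<u64> = vec![1, 2, 3, 4, 5];
    let mut media_ids: Vec<u64> = vec![];

    let mut uploads = stream::iter(attachments)
        .map(|n| tokio::task::spawn(async move { fake_upload(n).await }))
        .buffered(4); // at most 4 tasks in flight, results come back in input order

    while let Some(joined) = uploads.next().await {
        match joined {
            Ok(Ok(id)) => media_ids.push(id),                // upload succeeded
            Ok(Err(e)) => eprintln!("upload failed: {e}"),   // upload returned an error
            Err(e) => eprintln!("task failed to join: {e}"), // task panicked or was cancelled
        }
    }

    assert_eq!(media_ids, vec![10, 20, 30, 40, 50]);
}

The double Result mirrors the real code: the inner one is the upload outcome, the outer one is the JoinError from the spawned task.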


@@ -1,15 +1,59 @@
-use dissolve::strip_html_tags;
+use html_escape::decode_html_entities;
 use megalodon::entities::status::Tag;
 use regex::Regex;
 use std::error::Error;
+/// Generate 2 contents out of 1 if that content is > 280 chars, None else
+pub fn generate_multi_tweets(content: &str) -> Option<(String, String)> {
+    // Twitter webforms are utf-8 encoded, so we cannot count on len(), we dont need
+    // encode_utf16().count()
+    if twitter_count(content) <= 280 {
+        return None;
+    }
+    let split_content = content.split(' ');
+    let split_count = split_content.clone().count();
+    let first_half: String = split_content
+        .clone()
+        .take(split_count / 2)
+        .collect::<Vec<_>>()
+        .join(" ");
+    let second_half: String = split_content
+        .clone()
+        .skip(split_count / 2)
+        .collect::<Vec<_>>()
+        .join(" ");
+    Some((first_half, second_half))
+}
+/// Twitter doesnt count words the same we do, so youll have to improvise
+fn twitter_count(content: &str) -> usize {
+    let mut count = 0;
+    let split_content = content.split(' ');
+    count += split_content.clone().count() - 1; // count the spaces
+    for word in split_content {
+        if word.starts_with("http://") || word.starts_with("https://") {
+            count += 23;
+        } else {
+            count += word.chars().count();
+        }
+    }
+    count
+}
 pub fn strip_everything(content: &str, tags: &Vec<Tag>) -> Result<String, Box<dyn Error>> {
-    let mut res =
-        strip_html_tags(&content.replace("</p><p>", "\n\n").replace("<br />", "\n")).join("");
+    let mut res = strip_html_tags(&content.replace("</p><p>", "\n\n").replace("<br />", "\n"));
     strip_mastodon_tags(&mut res, tags).unwrap();
     res = res.trim_end_matches('\n').trim_end_matches(' ').to_string();
+    res = decode_html_entities(&res).to_string();
     Ok(res)
 }
@@ -22,3 +66,109 @@ fn strip_mastodon_tags(content: &mut String, tags: &Vec<Tag>) -> Result<(), Box<
Ok(())
}
fn strip_html_tags(input: &str) -> String {
let mut data = String::new();
let mut inside = false;
for c in input.chars() {
if c == '<' {
inside = true;
continue;
}
if c == '>' {
inside = false;
continue;
}
if !inside {
data.push(c);
}
}
data
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_twitter_count() {
let content = "tamerelol?! 🐵";
assert_eq!(twitter_count(content), content.chars().count());
let content = "Shoot out to https://y.ml/ !";
assert_eq!(twitter_count(content), 38);
let content = "this is the link https://www.google.com/tamerelol/youpi/tonperemdr/tarace.html if you like! What if I shit a final";
assert_eq!(twitter_count(content), 76);
}
#[test]
fn test_generate_multi_tweets_to_none() {
// test «standard» text
let tweet_content =
"LOLOLOL, je suis bien trop petit pour être coupé en deux voyons :troll:".to_string();
let youpi = generate_multi_tweets(&tweet_content);
assert_eq!(None, youpi);
// test with «complex» emoji (2 utf-8 chars)
let tweet_content = "🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷🇫🇷".to_string();
let youpi = generate_multi_tweets(&tweet_content);
assert_eq!(None, youpi);
}
#[test]
fn test_generate_multi_tweets_to_some() {
let tweet_content = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ipsum dolor sit amet consectetur adipiscing elit pellentesque. Pharetra pharetra massa massa ultricies mi quis hendrerit dolor. Mauris nunc congue nisi vitae. Scelerisque varius morbi enim nunc faucibus a pellentesque sit amet. Morbi leo urna molestie at elementum. Tristique et egestas quis ipsum suspendisse ultrices gravida dictum fusce. Amet porttitor eget dolor morbi.".to_string();
let youpi = generate_multi_tweets(&tweet_content);
let first_half = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ipsum dolor sit amet consectetur adipiscing elit pellentesque. Pharetra pharetra massa massa ultricies mi quis hendrerit dolor.".to_string();
let second_half = "Mauris nunc congue nisi vitae. Scelerisque varius morbi enim nunc faucibus a pellentesque sit amet. Morbi leo urna molestie at elementum. Tristique et egestas quis ipsum suspendisse ultrices gravida dictum fusce. Amet porttitor eget dolor morbi.".to_string();
assert_eq!(youpi, Some((first_half, second_half)));
}
#[test]
fn test_strip_mastodon_tags() {
let tags = vec![
Tag {
name: "putaclic".to_string(),
url: "https://m.nintendojo.fr/tags/putaclic".to_string(),
},
Tag {
name: "tamerelol".to_string(),
url: "https://m.nintendojo.fr/tags/tamerelol".to_string(),
},
Tag {
name: "JeFaisNawakEnCamelCase".to_string(),
url: "https://m.nintendojo.fr/tags/jefaisnawakencamelcase".to_string(),
},
];
let mut content =
"Cest super ça! #putaclic #TAMERELOL #JeFaisNawakEnCamelCase".to_string();
let sample = "Cest super ça! ".to_string();
strip_mastodon_tags(&mut content, &tags).unwrap();
assert_eq!(content, sample);
}
#[test]
fn test_strip_everything() {
let content = "<p>Ce soir à 21h, c&#39;est le Dojobar ! Au programme ce soir, une rétrospective sur la série Mario &amp; Luigi.<br />Comme d&#39;hab, le Twitch sera ici : <a href=\"https://twitch.tv/nintendojofr\" target=\"_blank\" rel=\"nofollow noopener noreferrer\" translate=\"no\"><span class=\"invisible\">https://</span><span class=\"\">twitch.tv/nintendojofr</span><span class=\"invisible\"></span></a><br />Ou juste l&#39;audio là : <a href=\"https://nintendojo.fr/dojobar\" target=\"_blank\" rel=\"nofollow noopener noreferrer\" translate=\"no\"><span class=\"invisible\">https://</span><span class=\"\">nintendojo.fr/dojobar</span><span class=\"invisible\"></span></a><br />A toute !</p>";
let expected_result = "Ce soir à 21h, c'est le Dojobar ! Au programme ce soir, une rétrospective sur la série Mario & Luigi.\nComme d'hab, le Twitch sera ici : https://twitch.tv/nintendojofr\nOu juste l'audio là : https://nintendojo.fr/dojobar\nA toute !".to_string();
let result = strip_everything(content, &vec![]).unwrap();
assert_eq!(result, expected_result);
}
}
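
As a worked example of the counting rule exercised by test_twitter_count above: in "Shoot out to https://y.ml/ !", the words contribute 5 + 3 + 2 + 23 + 1 = 34 (any http/https link counts as a flat 23 characters, matching Twitter's t.co wrapping) and the 4 separating spaces bring the total to 38, which is exactly the value the test asserts.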