author     Benedikt Peetz <benedikt.peetz@b-peetz.de>  2024-08-22 14:01:22 +0200
committer  Benedikt Peetz <benedikt.peetz@b-peetz.de>  2024-08-22 14:01:22 +0200
commit     53f49fa9279ac86944dfdc48f80a5783430632bf (patch)
tree       6732d9b1f167d4cfe091b30378e1d5c6a8f4a4cb /src/update
parent     build(package): Include python `blake3` dependency (diff)
download   yt-53f49fa9279ac86944dfdc48f80a5783430632bf.tar.gz
           yt-53f49fa9279ac86944dfdc48f80a5783430632bf.zip
perf(raw_update.py)!: Don't fetch entries that are already in the database
Testing shows a speed-up of circa 3400%: an update of a single
subscription that is already fully stored now takes ca. 1 second, down
from the previous 34 seconds (a factor of 34, hence the ~3400%).

BREAKING CHANGE: The extractor hash is now calculated from the `id`
field instead of the `webpage_url` field, which requires a complete
re-fetch of all stored videos:
```bash
# export your subscriptions
$ yt subs list --urls > subs.txt
# move the old database out of the way
$ mv ~/.local/share/yt/videos.sqlite{,.old}
# re-import the subscriptions
$ yt subs import subs.txt
# re-fetch all videos
$ yt update
```
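The hash change itself is the one-line diff in `process_subscription` below. A minimal sketch of before and after, assuming only the `blake3` crate (the example `id`/URL values here are made up):

```rust
// Minimal sketch of the breaking hash change, assuming the `blake3` crate.
// Before this commit the extractor hash keyed on the page URL; afterwards
// it keys on the extractor-provided `id`, which is stable across fetches.
fn extractor_hash_old(webpage_url: &str) -> blake3::Hash {
    blake3::hash(webpage_url.as_bytes())
}

fn extractor_hash_new(id: &str) -> blake3::Hash {
    blake3::hash(id.as_bytes())
}

fn main() {
    // The keys differ for every video, so each stored entry misses the
    // lookup, which is why the migration above re-fetches everything.
    assert_ne!(
        extractor_hash_old("https://example.com/watch?v=abc123"),
        extractor_hash_new("abc123")
    );
}
```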
Diffstat
-rw-r--r--  src/update/mod.rs  24
1 file changed, 10 insertions, 14 deletions
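The first hunk below also changes the argv contract of `raw_update.py` from `<max_backlog> <url>...` to `<max_backlog> <url-count> <url>... <hash>...`; the explicit count is what lets the consumer split the flat argument list back into URLs and hashes. A hypothetical sketch of that split, written in Rust for consistency with the diff even though the real consumer is the Python script:

```rust
// Hypothetical sketch of parsing the new argv layout:
//   <max_backlog> <url-count> <url>... <hash>...
// The real consumer is raw_update.py; this only illustrates the protocol.
fn split_argv(args: &[String]) -> (usize, &[String], &[String]) {
    let max_backlog: usize = args[0].parse().expect("max_backlog must be a number");
    let url_count: usize = args[1].parse().expect("url count must be a number");
    let (urls, hashes) = args[2..].split_at(url_count);
    (max_backlog, urls, hashes)
}

fn main() {
    let args: Vec<String> = ["20", "2", "url-a", "url-b", "hash-1", "hash-2"]
        .into_iter()
        .map(String::from)
        .collect();
    let (max_backlog, urls, hashes) = split_argv(&args);
    assert_eq!(max_backlog, 20);
    assert_eq!(urls.len(), 2);
    assert_eq!(hashes.len(), 2);
}
```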
diff --git a/src/update/mod.rs b/src/update/mod.rs
index bdd6c27..119c53c 100644
--- a/src/update/mod.rs
+++ b/src/update/mod.rs
@@ -53,9 +53,15 @@ pub async fn update(
         }
     }
 
+    // We can get away with not having to re-fetch the hashes every time, as the returned videos
+    // should not contain duplicates.
+    let hashes = get_all_hashes(app).await?;
+
     let mut child = Command::new("raw_update.py")
         .arg(max_backlog.to_string())
+        .arg(urls.len().to_string())
         .args(&urls)
+        .args(&hashes.iter().map(|haz| haz.to_string()).collect::<Vec<_>>())
         .stdout(Stdio::piped())
         .stderr(Stdio::null())
         .stdin(Stdio::null())
@@ -70,10 +76,6 @@ pub async fn update(
     )
     .lines();
 
-    // We can get away with not having to re-fetch the hashes every time, as the returned video
-    // should not contain duplicates.
-    let hashes = get_all_hashes(app).await?;
-
     while let Some(line) = out.next_line().await? {
         // use tokio::{fs::File, io::AsyncWriteExt};
         // let mut output = File::create("output.json").await?;
@@ -93,7 +95,7 @@ pub async fn update(
 
     let out = child.wait().await?;
     if out.success() {
-        error!("A yt update-once invokation failed for all subscriptions.")
+        error!("The update_raw.py invokation failed for all subscriptions.")
     }
 
     Ok(())
@@ -174,16 +176,11 @@ async fn process_subscription(
         unsmuggle_url(smug_url)?
     };
 
-    let extractor_hash = blake3::hash(url.as_str().as_bytes());
+    let extractor_hash = blake3::hash(unwrap_option!(entry.id).as_bytes());
 
     if hashes.contains(&extractor_hash) {
         // We already stored the video information
-        println!(
-            "(Ignoring duplicated video from: '{}' -> '{}')",
-            sub.name,
-            unwrap_option!(entry.title)
-        );
-        return Ok(());
+        unreachable!("The python update script should never have provided us with a duplicate video");
     } else {
         let video = Video {
             cache_path: None,
@@ -203,7 +200,6 @@ async fn process_subscription(
 
         println!("{}", video.to_color_display());
         add_video(app, video).await?;
+        Ok(())
     }
-
-    Ok(())
 }
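The net effect of the second half of the diff is that duplicate filtering moves from the Rust side (after fetching) into the fetch step itself: entries whose extractor hash is already stored are never fetched, so a duplicate reaching `process_subscription` is now a bug (`unreachable!`). A minimal sketch of that skip-if-known idea, assuming the `blake3` crate; the function and variable names are illustrative, not the project's actual API:

```rust
use std::collections::HashSet;

// Minimal sketch of the skip-if-known filter this commit pushes down into
// raw_update.py: hash each candidate id and drop the ones already stored.
// Keys are raw blake3 bytes to keep the set type simple.
fn filter_unknown<'a>(
    candidate_ids: &'a [String],
    stored: &HashSet<[u8; 32]>,
) -> Vec<&'a str> {
    candidate_ids
        .iter()
        .filter(|id| !stored.contains(blake3::hash(id.as_bytes()).as_bytes()))
        .map(String::as_str)
        .collect()
}

fn main() {
    let stored: HashSet<[u8; 32]> = ["known-id"]
        .iter()
        .map(|id| *blake3::hash(id.as_bytes()).as_bytes())
        .collect();
    let candidates = vec!["known-id".to_string(), "new-id".to_string()];
    // Only the unknown id survives, so only it is fetched.
    assert_eq!(filter_unknown(&candidates, &stored), ["new-id"]);
}
```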