From 50f5a3bb08e9dfdd3d1e5773bc8a0c3296ae5a6f Mon Sep 17 00:00:00 2001 From: kvillers Date: Tue, 12 May 2026 16:36:45 +0200 Subject: [PATCH] ok --- Cargo.lock | 1 + Cargo.toml | 1 + README.md | 1 + src/main.rs | 137 ++++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 115 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 47a2936..9791140 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1451,6 +1451,7 @@ dependencies = [ "calamine", "dialoguer", "image", + "rayon", "reqwest", "rfd", "rust_xlsxwriter", diff --git a/Cargo.toml b/Cargo.toml index 84c61d7..63b42bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ edition = "2024" calamine = "0.31" dialoguer = "0.11" image = "0.25" +rayon = "1.10" reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] } rfd = "0.15" rust_xlsxwriter = "0.83" diff --git a/README.md b/README.md index 1d76492..792f0dd 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Then it reads the header row and asks you to map three columns using the same TU After mapping, it downloads each image from the selected `Url` column, reads image metadata, and writes a new Excel file next to the source workbook as `result_[uuid].xlsx`. For testing, it currently processes only the first 100 non-empty URL rows. +Image processing runs in parallel workers with timeouts and response-size guards to avoid long hangs. Output columns: - `Cikkszám` diff --git a/src/main.rs b/src/main.rs index 16fad7d..431e3ce 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,14 +1,30 @@ -use std::{ - path::{Path, PathBuf}, - time::Duration -}; use calamine::{Data, Range, Reader, open_workbook_auto}; use dialoguer::{Select, theme::ColorfulTheme}; use image::GenericImageView; +use rayon::prelude::*; +use rayon::{ThreadPool, ThreadPoolBuilder}; use reqwest::blocking::Client; +use reqwest::header::CONTENT_TYPE; use rust_xlsxwriter::Workbook; +use std::{ + io::Read, + path::{Path, PathBuf}, + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, + thread, + time::Duration, +}; use uuid::Uuid; +const MAX_TEST_ROWS: usize = 100_000_000; +const MAX_CONCURRENCY: usize = 6; +const REQUEST_TIMEOUT_SECS: u64 = 20; +const CONNECT_TIMEOUT_SECS: u64 = 5; +const MAX_IMAGE_BYTES: usize = 20 * 1024 * 1024; +const MAX_RETRIES: usize = 1; + fn main() { let Some(file) = rfd::FileDialog::new() .add_filter("Excel files", &["xls", "xlsx"]) @@ -79,11 +95,16 @@ fn main() { eprintln!("No data rows with URL values were found."); std::process::exit(1); } - println!("Processing first {} rows for testing.", input_rows.len()); + println!( + "Processing up to {} rows for testing (found {}).", + MAX_TEST_ROWS, + input_rows.len() + ); let client = match Client::builder() .user_agent("o8_pics_size/0.1") - .timeout(Duration::from_secs(45)) + .connect_timeout(Duration::from_secs(CONNECT_TIMEOUT_SECS)) + .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS)) .build() { Ok(client) => client, @@ -93,25 +114,33 @@ fn main() { } }; - let mut output_rows = Vec::new(); + let thread_pool = build_thread_pool(MAX_CONCURRENCY); + let progress = Arc::new(AtomicUsize::new(0)); let total = input_rows.len(); - for (index, row) in input_rows.iter().enumerate() { - println!("[{}/{}] Fetching {}", index + 1, total, row.url); - let metadata = match fetch_image_metadata(&client, &row.url) { - Ok(metadata) => Some(metadata), - Err(err) => { - eprintln!("Failed to fetch '{}': {}", row.url, err); - None - } - }; + let output_rows = thread_pool.install(|| { + input_rows + .par_iter() + .map(|row| { + let metadata = match fetch_image_metadata_with_retry(&client, &row.url) { + Ok(metadata) => Some(metadata), + Err(err) => { + eprintln!("Failed to fetch '{}': {}", row.url, err); + None + } + }; - output_rows.push(OutputRow { - cikkszam: row.cikkszam.clone(), - sequence: row.sequence.clone(), - url: row.url.clone(), - metadata, - }); - } + let completed = progress.fetch_add(1, Ordering::Relaxed) + 1; + println!("[{}/{}] Processed {}", completed, total, row.url); + + OutputRow { + cikkszam: row.cikkszam.clone(), + sequence: row.sequence.clone(), + url: row.url.clone(), + metadata, + } + }) + .collect::>() + }); let output_path = build_output_path(&file); if let Err(err) = write_results_excel(&output_path, &output_rows) { @@ -262,6 +291,10 @@ fn collect_input_rows( sequence: cell_string(row.get(sequence_idx)), url, }); + + if rows.len() >= MAX_TEST_ROWS { + break; + } } rows @@ -274,6 +307,24 @@ fn cell_string(cell: Option<&Data>) -> String { } } +fn fetch_image_metadata_with_retry(client: &Client, url: &str) -> Result { + let mut last_error: Option = None; + + for attempt in 0..=MAX_RETRIES { + match fetch_image_metadata(client, url) { + Ok(metadata) => return Ok(metadata), + Err(err) => { + last_error = Some(err); + if attempt < MAX_RETRIES { + thread::sleep(Duration::from_millis(350 * (attempt + 1) as u64)); + } + } + } + } + + Err(last_error.unwrap_or_else(|| "unknown error".to_string())) +} + fn fetch_image_metadata(client: &Client, url: &str) -> Result { let response = client .get(url) @@ -282,9 +333,35 @@ fn fetch_image_metadata(client: &Client, url: &str) -> Result MAX_IMAGE_BYTES as u64 + { + return Err(format!( + "image too large ({} bytes > {} bytes limit)", + length, MAX_IMAGE_BYTES + )); + } + + let mut bytes = Vec::new(); + let mut limited_reader = response.take((MAX_IMAGE_BYTES + 1) as u64); + limited_reader + .read_to_end(&mut bytes) .map_err(|err| format!("failed to read response body: {}", err))?; + if bytes.len() > MAX_IMAGE_BYTES { + return Err(format!( + "image exceeded {} bytes while downloading", + MAX_IMAGE_BYTES + )); + } let image = image::load_from_memory(&bytes) .map_err(|err| format!("unable to decode image bytes: {}", err))?; @@ -302,6 +379,16 @@ fn build_output_path(input_path: &Path) -> PathBuf { input_path.with_file_name(format!("result_{}.xlsx", Uuid::new_v4())) } +fn build_thread_pool(max_threads: usize) -> ThreadPool { + match ThreadPoolBuilder::new().num_threads(max_threads).build() { + Ok(pool) => pool, + Err(err) => { + eprintln!("Failed to build worker pool: {}", err); + std::process::exit(1); + } + } +} + fn write_results_excel(path: &Path, rows: &[OutputRow]) -> Result<(), String> { let mut workbook = Workbook::new(); let worksheet = workbook.add_worksheet();