This commit is contained in:
Villers Krisztián 2026-05-12 16:36:45 +02:00
parent c560579285
commit 50f5a3bb08
4 changed files with 115 additions and 25 deletions

1
Cargo.lock generated
View File

@ -1451,6 +1451,7 @@ dependencies = [
"calamine", "calamine",
"dialoguer", "dialoguer",
"image", "image",
"rayon",
"reqwest", "reqwest",
"rfd", "rfd",
"rust_xlsxwriter", "rust_xlsxwriter",

View File

@ -7,6 +7,7 @@ edition = "2024"
calamine = "0.31" calamine = "0.31"
dialoguer = "0.11" dialoguer = "0.11"
image = "0.25" image = "0.25"
rayon = "1.10"
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] } reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
rfd = "0.15" rfd = "0.15"
rust_xlsxwriter = "0.83" rust_xlsxwriter = "0.83"

View File

@ -9,6 +9,7 @@ Then it reads the header row and asks you to map three columns using the same TU
After mapping, it downloads each image from the selected `Url` column, reads image metadata, and writes a new Excel file next to the source workbook as `result_[uuid].xlsx`. After mapping, it downloads each image from the selected `Url` column, reads image metadata, and writes a new Excel file next to the source workbook as `result_[uuid].xlsx`.
For testing, it currently processes only the first 100 non-empty URL rows. For testing, it currently processes only the first 100 non-empty URL rows.
Image processing runs in parallel workers with timeouts and response-size guards to avoid long hangs.
Output columns: Output columns:
- `Cikkszám` - `Cikkszám`

View File

@ -1,14 +1,30 @@
use std::{
path::{Path, PathBuf},
time::Duration
};
use calamine::{Data, Range, Reader, open_workbook_auto}; use calamine::{Data, Range, Reader, open_workbook_auto};
use dialoguer::{Select, theme::ColorfulTheme}; use dialoguer::{Select, theme::ColorfulTheme};
use image::GenericImageView; use image::GenericImageView;
use rayon::prelude::*;
use rayon::{ThreadPool, ThreadPoolBuilder};
use reqwest::blocking::Client; use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use rust_xlsxwriter::Workbook; use rust_xlsxwriter::Workbook;
use std::{
io::Read,
path::{Path, PathBuf},
sync::{
Arc,
atomic::{AtomicUsize, Ordering},
},
thread,
time::Duration,
};
use uuid::Uuid; use uuid::Uuid;
/// Upper bound on the number of input rows collected from the sheet; row
/// collection stops once this many rows have been gathered.
/// NOTE(review): the README states that only the first 100 non-empty URL rows
/// are processed, while this value is 100_000_000 — confirm which is intended.
const MAX_TEST_ROWS: usize = 100_000_000;
/// Number of threads requested for the worker pool that processes image rows.
const MAX_CONCURRENCY: usize = 6;
/// Value, in seconds, passed to the HTTP client builder's `timeout`.
const REQUEST_TIMEOUT_SECS: u64 = 20;
/// Value, in seconds, passed to the HTTP client builder's `connect_timeout`.
const CONNECT_TIMEOUT_SECS: u64 = 5;
/// Largest response body, in bytes, accepted for a single image (20 MiB).
/// Checked against the reported content length and again while reading the body.
const MAX_IMAGE_BYTES: usize = 20 * 1024 * 1024;
/// Number of additional attempts made after the first failed fetch of a URL.
const MAX_RETRIES: usize = 1;
fn main() { fn main() {
let Some(file) = rfd::FileDialog::new() let Some(file) = rfd::FileDialog::new()
.add_filter("Excel files", &["xls", "xlsx"]) .add_filter("Excel files", &["xls", "xlsx"])
@ -79,11 +95,16 @@ fn main() {
eprintln!("No data rows with URL values were found."); eprintln!("No data rows with URL values were found.");
std::process::exit(1); std::process::exit(1);
} }
println!("Processing first {} rows for testing.", input_rows.len()); println!(
"Processing up to {} rows for testing (found {}).",
MAX_TEST_ROWS,
input_rows.len()
);
let client = match Client::builder() let client = match Client::builder()
.user_agent("o8_pics_size/0.1") .user_agent("o8_pics_size/0.1")
.timeout(Duration::from_secs(45)) .connect_timeout(Duration::from_secs(CONNECT_TIMEOUT_SECS))
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
.build() .build()
{ {
Ok(client) => client, Ok(client) => client,
@ -93,11 +114,14 @@ fn main() {
} }
}; };
let mut output_rows = Vec::new(); let thread_pool = build_thread_pool(MAX_CONCURRENCY);
let progress = Arc::new(AtomicUsize::new(0));
let total = input_rows.len(); let total = input_rows.len();
for (index, row) in input_rows.iter().enumerate() { let output_rows = thread_pool.install(|| {
println!("[{}/{}] Fetching {}", index + 1, total, row.url); input_rows
let metadata = match fetch_image_metadata(&client, &row.url) { .par_iter()
.map(|row| {
let metadata = match fetch_image_metadata_with_retry(&client, &row.url) {
Ok(metadata) => Some(metadata), Ok(metadata) => Some(metadata),
Err(err) => { Err(err) => {
eprintln!("Failed to fetch '{}': {}", row.url, err); eprintln!("Failed to fetch '{}': {}", row.url, err);
@ -105,13 +129,18 @@ fn main() {
} }
}; };
output_rows.push(OutputRow { let completed = progress.fetch_add(1, Ordering::Relaxed) + 1;
println!("[{}/{}] Processed {}", completed, total, row.url);
OutputRow {
cikkszam: row.cikkszam.clone(), cikkszam: row.cikkszam.clone(),
sequence: row.sequence.clone(), sequence: row.sequence.clone(),
url: row.url.clone(), url: row.url.clone(),
metadata, metadata,
});
} }
})
.collect::<Vec<_>>()
});
let output_path = build_output_path(&file); let output_path = build_output_path(&file);
if let Err(err) = write_results_excel(&output_path, &output_rows) { if let Err(err) = write_results_excel(&output_path, &output_rows) {
@ -262,6 +291,10 @@ fn collect_input_rows(
sequence: cell_string(row.get(sequence_idx)), sequence: cell_string(row.get(sequence_idx)),
url, url,
}); });
if rows.len() >= MAX_TEST_ROWS {
break;
}
} }
rows rows
@ -274,6 +307,24 @@ fn cell_string(cell: Option<&Data>) -> String {
} }
} }
/// Fetches image metadata for `url`, retrying failed attempts.
///
/// Makes one initial attempt plus up to `MAX_RETRIES` further attempts. Before
/// each retry the calling thread sleeps for 350 ms multiplied by the number of
/// attempts already made, so the wait grows linearly.
///
/// # Errors
///
/// Returns the error message produced by the final failed attempt.
fn fetch_image_metadata_with_retry(client: &Client, url: &str) -> Result<ImageMetadata, String> {
    // Counts the attempts that have already failed.
    let mut failures: usize = 0;
    loop {
        match fetch_image_metadata(client, url) {
            Ok(metadata) => return Ok(metadata),
            // Retry budget exhausted: surface the most recent failure as-is.
            Err(message) if failures >= MAX_RETRIES => return Err(message),
            Err(_) => {
                failures += 1;
                let backoff = Duration::from_millis(350 * failures as u64);
                thread::sleep(backoff);
            }
        }
    }
}
fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, String> { fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, String> {
let response = client let response = client
.get(url) .get(url)
@ -282,9 +333,35 @@ fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, Str
let response = response let response = response
.error_for_status() .error_for_status()
.map_err(|err| format!("HTTP error: {}", err))?; .map_err(|err| format!("HTTP error: {}", err))?;
let bytes = response let content_type = response
.bytes() .headers()
.get(CONTENT_TYPE)
.and_then(|value| value.to_str().ok())
.unwrap_or("");
if !content_type.is_empty() && !content_type.starts_with("image/") {
return Err(format!("content type '{}' is not an image", content_type));
}
if let Some(length) = response.content_length()
&& length > MAX_IMAGE_BYTES as u64
{
return Err(format!(
"image too large ({} bytes > {} bytes limit)",
length, MAX_IMAGE_BYTES
));
}
let mut bytes = Vec::new();
let mut limited_reader = response.take((MAX_IMAGE_BYTES + 1) as u64);
limited_reader
.read_to_end(&mut bytes)
.map_err(|err| format!("failed to read response body: {}", err))?; .map_err(|err| format!("failed to read response body: {}", err))?;
if bytes.len() > MAX_IMAGE_BYTES {
return Err(format!(
"image exceeded {} bytes while downloading",
MAX_IMAGE_BYTES
));
}
let image = image::load_from_memory(&bytes) let image = image::load_from_memory(&bytes)
.map_err(|err| format!("unable to decode image bytes: {}", err))?; .map_err(|err| format!("unable to decode image bytes: {}", err))?;
@ -302,6 +379,16 @@ fn build_output_path(input_path: &Path) -> PathBuf {
input_path.with_file_name(format!("result_{}.xlsx", Uuid::new_v4())) input_path.with_file_name(format!("result_{}.xlsx", Uuid::new_v4()))
} }
/// Builds a rayon thread pool configured with `max_threads` threads.
///
/// This function does not return an error: if the pool cannot be constructed,
/// the failure is printed to stderr and the process exits with status 1.
fn build_thread_pool(max_threads: usize) -> ThreadPool {
    let builder = ThreadPoolBuilder::new().num_threads(max_threads);
    builder.build().unwrap_or_else(|build_error| {
        eprintln!("Failed to build worker pool: {}", build_error);
        std::process::exit(1)
    })
}
fn write_results_excel(path: &Path, rows: &[OutputRow]) -> Result<(), String> { fn write_results_excel(path: &Path, rows: &[OutputRow]) -> Result<(), String> {
let mut workbook = Workbook::new(); let mut workbook = Workbook::new();
let worksheet = workbook.add_worksheet(); let worksheet = workbook.add_worksheet();