This commit is contained in:
Villers Krisztián 2026-05-12 16:36:45 +02:00
parent c560579285
commit 50f5a3bb08
4 changed files with 115 additions and 25 deletions

1
Cargo.lock generated
View File

@ -1451,6 +1451,7 @@ dependencies = [
"calamine",
"dialoguer",
"image",
"rayon",
"reqwest",
"rfd",
"rust_xlsxwriter",

View File

@ -7,6 +7,7 @@ edition = "2024"
calamine = "0.31"
dialoguer = "0.11"
image = "0.25"
rayon = "1.10"
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
rfd = "0.15"
rust_xlsxwriter = "0.83"

View File

@ -9,6 +9,7 @@ Then it reads the header row and asks you to map three columns using the same TU
After mapping, it downloads each image from the selected `Url` column, reads image metadata, and writes a new Excel file next to the source workbook as `result_[uuid].xlsx`.
For testing, it currently processes at most the first `MAX_TEST_ROWS` non-empty URL rows (the constant is currently set to 100,000,000).
Image processing runs in parallel workers with timeouts and response-size guards to avoid long hangs.
Output columns:
- `Cikkszám`

View File

@ -1,14 +1,30 @@
use std::{
path::{Path, PathBuf},
time::Duration
};
use calamine::{Data, Range, Reader, open_workbook_auto};
use dialoguer::{Select, theme::ColorfulTheme};
use image::GenericImageView;
use rayon::prelude::*;
use rayon::{ThreadPool, ThreadPoolBuilder};
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use rust_xlsxwriter::Workbook;
use std::{
io::Read,
path::{Path, PathBuf},
sync::{
Arc,
atomic::{AtomicUsize, Ordering},
},
thread,
time::Duration,
};
use uuid::Uuid;
/// Cap on how many input rows are collected; row collection stops once this many have been gathered.
const MAX_TEST_ROWS: usize = 100_000_000;
/// Thread count handed to the worker pool builder.
const MAX_CONCURRENCY: usize = 6;
/// Value passed to the HTTP client's `timeout` setting, in seconds.
const REQUEST_TIMEOUT_SECS: u64 = 20;
/// Value passed to the HTTP client's `connect_timeout` setting, in seconds.
const CONNECT_TIMEOUT_SECS: u64 = 5;
/// Largest response body, in bytes, accepted for a single image (20 MiB).
const MAX_IMAGE_BYTES: usize = 20 * 1024 * 1024;
/// Number of extra attempts after a failed fetch; a URL is tried `MAX_RETRIES + 1` times in total.
const MAX_RETRIES: usize = 1;
fn main() {
let Some(file) = rfd::FileDialog::new()
.add_filter("Excel files", &["xls", "xlsx"])
@ -79,11 +95,16 @@ fn main() {
eprintln!("No data rows with URL values were found.");
std::process::exit(1);
}
println!("Processing first {} rows for testing.", input_rows.len());
println!(
"Processing up to {} rows for testing (found {}).",
MAX_TEST_ROWS,
input_rows.len()
);
let client = match Client::builder()
.user_agent("o8_pics_size/0.1")
.timeout(Duration::from_secs(45))
.connect_timeout(Duration::from_secs(CONNECT_TIMEOUT_SECS))
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
.build()
{
Ok(client) => client,
@ -93,11 +114,14 @@ fn main() {
}
};
let mut output_rows = Vec::new();
let thread_pool = build_thread_pool(MAX_CONCURRENCY);
let progress = Arc::new(AtomicUsize::new(0));
let total = input_rows.len();
for (index, row) in input_rows.iter().enumerate() {
println!("[{}/{}] Fetching {}", index + 1, total, row.url);
let metadata = match fetch_image_metadata(&client, &row.url) {
let output_rows = thread_pool.install(|| {
input_rows
.par_iter()
.map(|row| {
let metadata = match fetch_image_metadata_with_retry(&client, &row.url) {
Ok(metadata) => Some(metadata),
Err(err) => {
eprintln!("Failed to fetch '{}': {}", row.url, err);
@ -105,13 +129,18 @@ fn main() {
}
};
output_rows.push(OutputRow {
let completed = progress.fetch_add(1, Ordering::Relaxed) + 1;
println!("[{}/{}] Processed {}", completed, total, row.url);
OutputRow {
cikkszam: row.cikkszam.clone(),
sequence: row.sequence.clone(),
url: row.url.clone(),
metadata,
});
}
})
.collect::<Vec<_>>()
});
let output_path = build_output_path(&file);
if let Err(err) = write_results_excel(&output_path, &output_rows) {
@ -262,6 +291,10 @@ fn collect_input_rows(
sequence: cell_string(row.get(sequence_idx)),
url,
});
if rows.len() >= MAX_TEST_ROWS {
break;
}
}
rows
@ -274,6 +307,24 @@ fn cell_string(cell: Option<&Data>) -> String {
}
}
/// Fetches image metadata for `url`, retrying after a failed attempt.
///
/// Calls `fetch_image_metadata` up to `MAX_RETRIES + 1` times and returns the
/// first successful result. Between attempts the current thread sleeps for
/// `350 ms * attempt number`, so the wait grows linearly with each retry.
///
/// # Errors
///
/// Returns the error message of the last failed attempt once every attempt
/// has failed.
fn fetch_image_metadata_with_retry(client: &Client, url: &str) -> Result<ImageMetadata, String> {
    let mut final_failure = None;
    let mut attempt = 0usize;

    while attempt <= MAX_RETRIES {
        let message = match fetch_image_metadata(client, url) {
            Ok(metadata) => return Ok(metadata),
            Err(message) => message,
        };
        final_failure = Some(message);
        attempt += 1;

        // Back off only when another attempt will actually follow.
        if attempt <= MAX_RETRIES {
            let backoff_ms = 350 * attempt as u64;
            thread::sleep(Duration::from_millis(backoff_ms));
        }
    }

    match final_failure {
        Some(message) => Err(message),
        None => Err("unknown error".to_string()),
    }
}
fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, String> {
let response = client
.get(url)
@ -282,9 +333,35 @@ fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, Str
let response = response
.error_for_status()
.map_err(|err| format!("HTTP error: {}", err))?;
let bytes = response
.bytes()
let content_type = response
.headers()
.get(CONTENT_TYPE)
.and_then(|value| value.to_str().ok())
.unwrap_or("");
if !content_type.is_empty() && !content_type.starts_with("image/") {
return Err(format!("content type '{}' is not an image", content_type));
}
if let Some(length) = response.content_length()
&& length > MAX_IMAGE_BYTES as u64
{
return Err(format!(
"image too large ({} bytes > {} bytes limit)",
length, MAX_IMAGE_BYTES
));
}
let mut bytes = Vec::new();
let mut limited_reader = response.take((MAX_IMAGE_BYTES + 1) as u64);
limited_reader
.read_to_end(&mut bytes)
.map_err(|err| format!("failed to read response body: {}", err))?;
if bytes.len() > MAX_IMAGE_BYTES {
return Err(format!(
"image exceeded {} bytes while downloading",
MAX_IMAGE_BYTES
));
}
let image = image::load_from_memory(&bytes)
.map_err(|err| format!("unable to decode image bytes: {}", err))?;
@ -302,6 +379,16 @@ fn build_output_path(input_path: &Path) -> PathBuf {
input_path.with_file_name(format!("result_{}.xlsx", Uuid::new_v4()))
}
/// Builds the rayon worker pool, configured with `max_threads` as its thread count.
///
/// This never returns an error to the caller: if the pool cannot be built,
/// the failure is printed to stderr and the process exits with status 1.
fn build_thread_pool(max_threads: usize) -> ThreadPool {
    let builder = ThreadPoolBuilder::new().num_threads(max_threads);
    builder.build().unwrap_or_else(|err| {
        eprintln!("Failed to build worker pool: {}", err);
        std::process::exit(1)
    })
}
fn write_results_excel(path: &Path, rows: &[OutputRow]) -> Result<(), String> {
let mut workbook = Workbook::new();
let worksheet = workbook.add_worksheet();