This commit is contained in:
parent
c560579285
commit
50f5a3bb08
|
|
@ -1451,6 +1451,7 @@ dependencies = [
|
|||
"calamine",
|
||||
"dialoguer",
|
||||
"image",
|
||||
"rayon",
|
||||
"reqwest",
|
||||
"rfd",
|
||||
"rust_xlsxwriter",
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ edition = "2024"
|
|||
calamine = "0.31"
|
||||
dialoguer = "0.11"
|
||||
image = "0.25"
|
||||
rayon = "1.10"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
||||
rfd = "0.15"
|
||||
rust_xlsxwriter = "0.83"
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ Then it reads the header row and asks you to map three columns using the same TU
|
|||
|
||||
After mapping, it downloads each image from the selected `Url` column, reads image metadata, and writes a new Excel file next to the source workbook as `result_[uuid].xlsx`.
|
||||
For testing, it currently processes only the first 100 non-empty URL rows.
|
||||
Image processing runs in parallel workers with timeouts and response-size guards to avoid long hangs.
|
||||
|
||||
Output columns:
|
||||
- `Cikkszám`
|
||||
|
|
|
|||
137
src/main.rs
137
src/main.rs
|
|
@ -1,14 +1,30 @@
|
|||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
time::Duration
|
||||
};
|
||||
use calamine::{Data, Range, Reader, open_workbook_auto};
|
||||
use dialoguer::{Select, theme::ColorfulTheme};
|
||||
use image::GenericImageView;
|
||||
use rayon::prelude::*;
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
use reqwest::blocking::Client;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use rust_xlsxwriter::Workbook;
|
||||
use std::{
|
||||
io::Read,
|
||||
path::{Path, PathBuf},
|
||||
sync::{
|
||||
Arc,
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
},
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Cap on how many input rows are collected: row collection stops as soon as
/// this many rows have been gathered, and the value is echoed in the start-up
/// progress message.
// NOTE(review): the README still says only the first 100 rows are processed — confirm which is intended.
const MAX_TEST_ROWS: usize = 100_000_000;
|
||||
/// Number of worker threads requested for the pool that processes rows in parallel.
const MAX_CONCURRENCY: usize = 6;
|
||||
/// Value, in seconds, given to the HTTP client's `timeout` setting.
const REQUEST_TIMEOUT_SECS: u64 = 20;
|
||||
/// Value, in seconds, given to the HTTP client's `connect_timeout` setting.
const CONNECT_TIMEOUT_SECS: u64 = 5;
|
||||
/// Largest accepted response body (20 MiB). Larger responses are rejected both
/// from the declared content length and while the body is being read.
const MAX_IMAGE_BYTES: usize = 20 * 1024 * 1024;
|
||||
/// Number of extra attempts made after a failed fetch (so `MAX_RETRIES + 1`
/// attempts in total).
const MAX_RETRIES: usize = 1;
|
||||
|
||||
fn main() {
|
||||
let Some(file) = rfd::FileDialog::new()
|
||||
.add_filter("Excel files", &["xls", "xlsx"])
|
||||
|
|
@ -79,11 +95,16 @@ fn main() {
|
|||
eprintln!("No data rows with URL values were found.");
|
||||
std::process::exit(1);
|
||||
}
|
||||
println!("Processing first {} rows for testing.", input_rows.len());
|
||||
println!(
|
||||
"Processing up to {} rows for testing (found {}).",
|
||||
MAX_TEST_ROWS,
|
||||
input_rows.len()
|
||||
);
|
||||
|
||||
let client = match Client::builder()
|
||||
.user_agent("o8_pics_size/0.1")
|
||||
.timeout(Duration::from_secs(45))
|
||||
.connect_timeout(Duration::from_secs(CONNECT_TIMEOUT_SECS))
|
||||
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
|
||||
.build()
|
||||
{
|
||||
Ok(client) => client,
|
||||
|
|
@ -93,25 +114,33 @@ fn main() {
|
|||
}
|
||||
};
|
||||
|
||||
let mut output_rows = Vec::new();
|
||||
let thread_pool = build_thread_pool(MAX_CONCURRENCY);
|
||||
let progress = Arc::new(AtomicUsize::new(0));
|
||||
let total = input_rows.len();
|
||||
for (index, row) in input_rows.iter().enumerate() {
|
||||
println!("[{}/{}] Fetching {}", index + 1, total, row.url);
|
||||
let metadata = match fetch_image_metadata(&client, &row.url) {
|
||||
Ok(metadata) => Some(metadata),
|
||||
Err(err) => {
|
||||
eprintln!("Failed to fetch '{}': {}", row.url, err);
|
||||
None
|
||||
}
|
||||
};
|
||||
let output_rows = thread_pool.install(|| {
|
||||
input_rows
|
||||
.par_iter()
|
||||
.map(|row| {
|
||||
let metadata = match fetch_image_metadata_with_retry(&client, &row.url) {
|
||||
Ok(metadata) => Some(metadata),
|
||||
Err(err) => {
|
||||
eprintln!("Failed to fetch '{}': {}", row.url, err);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
output_rows.push(OutputRow {
|
||||
cikkszam: row.cikkszam.clone(),
|
||||
sequence: row.sequence.clone(),
|
||||
url: row.url.clone(),
|
||||
metadata,
|
||||
});
|
||||
}
|
||||
let completed = progress.fetch_add(1, Ordering::Relaxed) + 1;
|
||||
println!("[{}/{}] Processed {}", completed, total, row.url);
|
||||
|
||||
OutputRow {
|
||||
cikkszam: row.cikkszam.clone(),
|
||||
sequence: row.sequence.clone(),
|
||||
url: row.url.clone(),
|
||||
metadata,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
|
||||
let output_path = build_output_path(&file);
|
||||
if let Err(err) = write_results_excel(&output_path, &output_rows) {
|
||||
|
|
@ -262,6 +291,10 @@ fn collect_input_rows(
|
|||
sequence: cell_string(row.get(sequence_idx)),
|
||||
url,
|
||||
});
|
||||
|
||||
if rows.len() >= MAX_TEST_ROWS {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rows
|
||||
|
|
@ -274,6 +307,24 @@ fn cell_string(cell: Option<&Data>) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
fn fetch_image_metadata_with_retry(client: &Client, url: &str) -> Result<ImageMetadata, String> {
|
||||
let mut last_error: Option<String> = None;
|
||||
|
||||
for attempt in 0..=MAX_RETRIES {
|
||||
match fetch_image_metadata(client, url) {
|
||||
Ok(metadata) => return Ok(metadata),
|
||||
Err(err) => {
|
||||
last_error = Some(err);
|
||||
if attempt < MAX_RETRIES {
|
||||
thread::sleep(Duration::from_millis(350 * (attempt + 1) as u64));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_error.unwrap_or_else(|| "unknown error".to_string()))
|
||||
}
|
||||
|
||||
fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, String> {
|
||||
let response = client
|
||||
.get(url)
|
||||
|
|
@ -282,9 +333,35 @@ fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, Str
|
|||
let response = response
|
||||
.error_for_status()
|
||||
.map_err(|err| format!("HTTP error: {}", err))?;
|
||||
let bytes = response
|
||||
.bytes()
|
||||
let content_type = response
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|value| value.to_str().ok())
|
||||
.unwrap_or("");
|
||||
if !content_type.is_empty() && !content_type.starts_with("image/") {
|
||||
return Err(format!("content type '{}' is not an image", content_type));
|
||||
}
|
||||
|
||||
if let Some(length) = response.content_length()
|
||||
&& length > MAX_IMAGE_BYTES as u64
|
||||
{
|
||||
return Err(format!(
|
||||
"image too large ({} bytes > {} bytes limit)",
|
||||
length, MAX_IMAGE_BYTES
|
||||
));
|
||||
}
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
let mut limited_reader = response.take((MAX_IMAGE_BYTES + 1) as u64);
|
||||
limited_reader
|
||||
.read_to_end(&mut bytes)
|
||||
.map_err(|err| format!("failed to read response body: {}", err))?;
|
||||
if bytes.len() > MAX_IMAGE_BYTES {
|
||||
return Err(format!(
|
||||
"image exceeded {} bytes while downloading",
|
||||
MAX_IMAGE_BYTES
|
||||
));
|
||||
}
|
||||
|
||||
let image = image::load_from_memory(&bytes)
|
||||
.map_err(|err| format!("unable to decode image bytes: {}", err))?;
|
||||
|
|
@ -302,6 +379,16 @@ fn build_output_path(input_path: &Path) -> PathBuf {
|
|||
input_path.with_file_name(format!("result_{}.xlsx", Uuid::new_v4()))
|
||||
}
|
||||
|
||||
fn build_thread_pool(max_threads: usize) -> ThreadPool {
|
||||
match ThreadPoolBuilder::new().num_threads(max_threads).build() {
|
||||
Ok(pool) => pool,
|
||||
Err(err) => {
|
||||
eprintln!("Failed to build worker pool: {}", err);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn write_results_excel(path: &Path, rows: &[OutputRow]) -> Result<(), String> {
|
||||
let mut workbook = Workbook::new();
|
||||
let worksheet = workbook.add_worksheet();
|
||||
|
|
|
|||
Loading…
Reference in New Issue