This commit is contained in:
parent
c560579285
commit
50f5a3bb08
|
|
@ -1451,6 +1451,7 @@ dependencies = [
|
||||||
"calamine",
|
"calamine",
|
||||||
"dialoguer",
|
"dialoguer",
|
||||||
"image",
|
"image",
|
||||||
|
"rayon",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"rfd",
|
"rfd",
|
||||||
"rust_xlsxwriter",
|
"rust_xlsxwriter",
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ edition = "2024"
|
||||||
calamine = "0.31"
|
calamine = "0.31"
|
||||||
dialoguer = "0.11"
|
dialoguer = "0.11"
|
||||||
image = "0.25"
|
image = "0.25"
|
||||||
|
rayon = "1.10"
|
||||||
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
|
||||||
rfd = "0.15"
|
rfd = "0.15"
|
||||||
rust_xlsxwriter = "0.83"
|
rust_xlsxwriter = "0.83"
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ Then it reads the header row and asks you to map three columns using the same TU
|
||||||
|
|
||||||
After mapping, it downloads each image from the selected `Url` column, reads image metadata, and writes a new Excel file next to the source workbook as `result_[uuid].xlsx`.
|
After mapping, it downloads each image from the selected `Url` column, reads image metadata, and writes a new Excel file next to the source workbook as `result_[uuid].xlsx`.
|
||||||
For testing, it currently processes only the first 100 non-empty URL rows.
|
For testing, it processes at most `MAX_TEST_ROWS` non-empty URL rows (currently 100,000,000).
|
||||||
|
Image processing runs in parallel workers with timeouts and response-size guards to avoid long hangs.
|
||||||
|
|
||||||
Output columns:
|
Output columns:
|
||||||
- `Cikkszám`
|
- `Cikkszám`
|
||||||
|
|
|
||||||
137
src/main.rs
137
src/main.rs
|
|
@ -1,14 +1,30 @@
|
||||||
use std::{
|
|
||||||
path::{Path, PathBuf},
|
|
||||||
time::Duration
|
|
||||||
};
|
|
||||||
use calamine::{Data, Range, Reader, open_workbook_auto};
|
use calamine::{Data, Range, Reader, open_workbook_auto};
|
||||||
use dialoguer::{Select, theme::ColorfulTheme};
|
use dialoguer::{Select, theme::ColorfulTheme};
|
||||||
use image::GenericImageView;
|
use image::GenericImageView;
|
||||||
|
use rayon::prelude::*;
|
||||||
|
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||||
use reqwest::blocking::Client;
|
use reqwest::blocking::Client;
|
||||||
|
use reqwest::header::CONTENT_TYPE;
|
||||||
use rust_xlsxwriter::Workbook;
|
use rust_xlsxwriter::Workbook;
|
||||||
|
use std::{
|
||||||
|
io::Read,
|
||||||
|
path::{Path, PathBuf},
|
||||||
|
sync::{
|
||||||
|
Arc,
|
||||||
|
atomic::{AtomicUsize, Ordering},
|
||||||
|
},
|
||||||
|
thread,
|
||||||
|
time::Duration,
|
||||||
|
};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
// Row collection stops once this many input rows have been gathered.
const MAX_TEST_ROWS: usize = 100_000_000;
|
||||||
|
// Thread count passed to `build_thread_pool` for the image-fetching workers.
const MAX_CONCURRENCY: usize = 6;
|
||||||
|
// Value, in seconds, given to the HTTP client's `.timeout(...)`.
const REQUEST_TIMEOUT_SECS: u64 = 20;
|
||||||
|
// Value, in seconds, given to the HTTP client's `.connect_timeout(...)`.
const CONNECT_TIMEOUT_SECS: u64 = 5;
|
||||||
|
// Largest accepted response body (20 MiB); checked against Content-Length and
// enforced again while the body is being read.
const MAX_IMAGE_BYTES: usize = 20 * 1024 * 1024;
|
||||||
|
// Number of additional attempts made after the first failed fetch.
const MAX_RETRIES: usize = 1;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let Some(file) = rfd::FileDialog::new()
|
let Some(file) = rfd::FileDialog::new()
|
||||||
.add_filter("Excel files", &["xls", "xlsx"])
|
.add_filter("Excel files", &["xls", "xlsx"])
|
||||||
|
|
@ -79,11 +95,16 @@ fn main() {
|
||||||
eprintln!("No data rows with URL values were found.");
|
eprintln!("No data rows with URL values were found.");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
}
|
}
|
||||||
println!("Processing first {} rows for testing.", input_rows.len());
|
println!(
|
||||||
|
"Processing up to {} rows for testing (found {}).",
|
||||||
|
MAX_TEST_ROWS,
|
||||||
|
input_rows.len()
|
||||||
|
);
|
||||||
|
|
||||||
let client = match Client::builder()
|
let client = match Client::builder()
|
||||||
.user_agent("o8_pics_size/0.1")
|
.user_agent("o8_pics_size/0.1")
|
||||||
.timeout(Duration::from_secs(45))
|
.connect_timeout(Duration::from_secs(CONNECT_TIMEOUT_SECS))
|
||||||
|
.timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
|
||||||
.build()
|
.build()
|
||||||
{
|
{
|
||||||
Ok(client) => client,
|
Ok(client) => client,
|
||||||
|
|
@ -93,25 +114,33 @@ fn main() {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut output_rows = Vec::new();
|
let thread_pool = build_thread_pool(MAX_CONCURRENCY);
|
||||||
|
let progress = Arc::new(AtomicUsize::new(0));
|
||||||
let total = input_rows.len();
|
let total = input_rows.len();
|
||||||
for (index, row) in input_rows.iter().enumerate() {
|
let output_rows = thread_pool.install(|| {
|
||||||
println!("[{}/{}] Fetching {}", index + 1, total, row.url);
|
input_rows
|
||||||
let metadata = match fetch_image_metadata(&client, &row.url) {
|
.par_iter()
|
||||||
Ok(metadata) => Some(metadata),
|
.map(|row| {
|
||||||
Err(err) => {
|
let metadata = match fetch_image_metadata_with_retry(&client, &row.url) {
|
||||||
eprintln!("Failed to fetch '{}': {}", row.url, err);
|
Ok(metadata) => Some(metadata),
|
||||||
None
|
Err(err) => {
|
||||||
}
|
eprintln!("Failed to fetch '{}': {}", row.url, err);
|
||||||
};
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
output_rows.push(OutputRow {
|
let completed = progress.fetch_add(1, Ordering::Relaxed) + 1;
|
||||||
cikkszam: row.cikkszam.clone(),
|
println!("[{}/{}] Processed {}", completed, total, row.url);
|
||||||
sequence: row.sequence.clone(),
|
|
||||||
url: row.url.clone(),
|
OutputRow {
|
||||||
metadata,
|
cikkszam: row.cikkszam.clone(),
|
||||||
});
|
sequence: row.sequence.clone(),
|
||||||
}
|
url: row.url.clone(),
|
||||||
|
metadata,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
});
|
||||||
|
|
||||||
let output_path = build_output_path(&file);
|
let output_path = build_output_path(&file);
|
||||||
if let Err(err) = write_results_excel(&output_path, &output_rows) {
|
if let Err(err) = write_results_excel(&output_path, &output_rows) {
|
||||||
|
|
@ -262,6 +291,10 @@ fn collect_input_rows(
|
||||||
sequence: cell_string(row.get(sequence_idx)),
|
sequence: cell_string(row.get(sequence_idx)),
|
||||||
url,
|
url,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if rows.len() >= MAX_TEST_ROWS {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
rows
|
rows
|
||||||
|
|
@ -274,6 +307,24 @@ fn cell_string(cell: Option<&Data>) -> String {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn fetch_image_metadata_with_retry(client: &Client, url: &str) -> Result<ImageMetadata, String> {
|
||||||
|
let mut last_error: Option<String> = None;
|
||||||
|
|
||||||
|
for attempt in 0..=MAX_RETRIES {
|
||||||
|
match fetch_image_metadata(client, url) {
|
||||||
|
Ok(metadata) => return Ok(metadata),
|
||||||
|
Err(err) => {
|
||||||
|
last_error = Some(err);
|
||||||
|
if attempt < MAX_RETRIES {
|
||||||
|
thread::sleep(Duration::from_millis(350 * (attempt + 1) as u64));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(last_error.unwrap_or_else(|| "unknown error".to_string()))
|
||||||
|
}
|
||||||
|
|
||||||
fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, String> {
|
fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, String> {
|
||||||
let response = client
|
let response = client
|
||||||
.get(url)
|
.get(url)
|
||||||
|
|
@ -282,9 +333,35 @@ fn fetch_image_metadata(client: &Client, url: &str) -> Result<ImageMetadata, Str
|
||||||
let response = response
|
let response = response
|
||||||
.error_for_status()
|
.error_for_status()
|
||||||
.map_err(|err| format!("HTTP error: {}", err))?;
|
.map_err(|err| format!("HTTP error: {}", err))?;
|
||||||
let bytes = response
|
let content_type = response
|
||||||
.bytes()
|
.headers()
|
||||||
|
.get(CONTENT_TYPE)
|
||||||
|
.and_then(|value| value.to_str().ok())
|
||||||
|
.unwrap_or("");
|
||||||
|
if !content_type.is_empty() && !content_type.starts_with("image/") {
|
||||||
|
return Err(format!("content type '{}' is not an image", content_type));
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(length) = response.content_length()
|
||||||
|
&& length > MAX_IMAGE_BYTES as u64
|
||||||
|
{
|
||||||
|
return Err(format!(
|
||||||
|
"image too large ({} bytes > {} bytes limit)",
|
||||||
|
length, MAX_IMAGE_BYTES
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut bytes = Vec::new();
|
||||||
|
let mut limited_reader = response.take((MAX_IMAGE_BYTES + 1) as u64);
|
||||||
|
limited_reader
|
||||||
|
.read_to_end(&mut bytes)
|
||||||
.map_err(|err| format!("failed to read response body: {}", err))?;
|
.map_err(|err| format!("failed to read response body: {}", err))?;
|
||||||
|
if bytes.len() > MAX_IMAGE_BYTES {
|
||||||
|
return Err(format!(
|
||||||
|
"image exceeded {} bytes while downloading",
|
||||||
|
MAX_IMAGE_BYTES
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
let image = image::load_from_memory(&bytes)
|
let image = image::load_from_memory(&bytes)
|
||||||
.map_err(|err| format!("unable to decode image bytes: {}", err))?;
|
.map_err(|err| format!("unable to decode image bytes: {}", err))?;
|
||||||
|
|
@ -302,6 +379,16 @@ fn build_output_path(input_path: &Path) -> PathBuf {
|
||||||
input_path.with_file_name(format!("result_{}.xlsx", Uuid::new_v4()))
|
input_path.with_file_name(format!("result_{}.xlsx", Uuid::new_v4()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn build_thread_pool(max_threads: usize) -> ThreadPool {
|
||||||
|
match ThreadPoolBuilder::new().num_threads(max_threads).build() {
|
||||||
|
Ok(pool) => pool,
|
||||||
|
Err(err) => {
|
||||||
|
eprintln!("Failed to build worker pool: {}", err);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn write_results_excel(path: &Path, rows: &[OutputRow]) -> Result<(), String> {
|
fn write_results_excel(path: &Path, rows: &[OutputRow]) -> Result<(), String> {
|
||||||
let mut workbook = Workbook::new();
|
let mut workbook = Workbook::new();
|
||||||
let worksheet = workbook.add_worksheet();
|
let worksheet = workbook.add_worksheet();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue