# Sequentially download every linked file for each row of `df`, bundle the
# files of one row into a zip archive inside `zip_dir`, then delete the
# temporary per-row directory.
#
# NOTE(review): df, link_columns, name_columns and zip_dir are not defined in
# the visible source; presumably they are parameters of an enclosing function
# whose `def` line is not shown -- confirm before relying on this fragment.

# Use current directory if zip_dir is not specified
if zip_dir is None:
    zip_dir = os.getcwd()

for index, row in df.iterrows():
    # Create a directory for each row to store downloaded files
    row_dir = os.path.join(zip_dir, f"row_{index}")
    os.makedirs(row_dir, exist_ok=True)

    try:
        # Download each file in link_columns
        for col in link_columns:
            url = row[col]
            filename = os.path.join(row_dir, url.split("/")[-1])
            response = requests.get(url)
            # Fail loudly on 4xx/5xx instead of saving the error page as if
            # it were the requested file.
            response.raise_for_status()
            with open(filename, "wb") as f:
                f.write(response.content)

        # Create a zip file for each row
        zip_filename = "_".join(str(row[col]) for col in name_columns) + ".zip"
        zip_path = os.path.join(zip_dir, zip_filename)
        with ZipFile(zip_path, "w") as zipf:
            for foldername, _subfolders, filenames in os.walk(row_dir):
                for filename in filenames:
                    # arcname drops the row_<index>/ prefix so the archive
                    # holds the files at its top level.
                    zipf.write(os.path.join(foldername, filename), arcname=filename)
    finally:
        # Clean up downloaded files, even when a download or the zip step
        # raised, so no row_<index> directory is left behind.
        for filename in os.listdir(row_dir):
            os.remove(os.path.join(row_dir, filename))
        os.rmdir(row_dir)
import os
import tempfile
from concurrent.futures import ThreadPoolExecutor
from zipfile import ZipFile

import pandas as pd
import requests
def download_and_zip(row, link_columns, name_columns, zip_dir, timeout=60):
    """Download every linked file for one row and bundle them into a zip file.

    The files are fetched into a private temporary directory created inside
    ``zip_dir``; that directory is removed again when the function returns or
    raises, so only the finished ``.zip`` is left behind.

    Args:
        row: Mapping-like row that supports ``row[col]`` for every column in
            ``link_columns`` and ``name_columns`` (for example a row yielded
            by ``DataFrame.iterrows()``).
        link_columns: Column names whose values are the URLs to download.
        name_columns: Column names whose values, converted with ``str`` and
            joined with ``_``, form the zip file name.
        zip_dir: Existing directory in which the zip file is written.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            server cannot block a worker forever.

    Raises:
        requests.RequestException: If a download fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    zip_filename = "_".join(str(row[col]) for col in name_columns) + ".zip"
    zip_path = os.path.join(zip_dir, zip_filename)

    # A uniquely named scratch directory keeps concurrent calls apart even
    # when two rows carry the same index label, and the context manager
    # deletes it on both the success and the error path.
    with tempfile.TemporaryDirectory(prefix="row_", dir=zip_dir) as row_dir:
        # Download each file in link_columns
        for col in link_columns:
            url = row[col]
            filename = os.path.join(row_dir, url.split("/")[-1])
            response = requests.get(url, timeout=timeout)
            # Fail loudly on 4xx/5xx instead of archiving an error page.
            response.raise_for_status()
            with open(filename, "wb") as f:
                f.write(response.content)

        # Create the zip file for this row; arcname drops the scratch
        # directory so the archive holds the files at its top level.
        with ZipFile(zip_path, "w") as zipf:
            for filename in sorted(os.listdir(row_dir)):
                zipf.write(os.path.join(row_dir, filename), arcname=filename)
# Run download_and_zip for every row of `df` on a thread pool.
#
# NOTE(review): zip_dir, max_threads, df, link_columns and name_columns are
# not defined in the visible source; presumably they are parameters of an
# enclosing function whose `def` line is not shown -- confirm.

# Use current directory if zip_dir is not specified
if zip_dir is None:
    zip_dir = os.getcwd()

with ThreadPoolExecutor(max_workers=max_threads) as executor:
    futures = [
        executor.submit(download_and_zip, row, link_columns, name_columns, zip_dir)
        for _, row in df.iterrows()
    ]
    # result() re-raises any exception raised inside a worker, so failures
    # surface here instead of being silently dropped.
    for future in futures:
        future.result()