该Python脚本用于处理/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json目录下的数据,将其中所有图片等比例缩小(最长边为1024像素),并连同JSON文件一起复制到新目录/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json_small_size中。
功能说明
依赖(bash):Pillow>=9.0.0 tqdm>=4.62.0
Python 代码:
import os
import shutil
import json
from PIL import Image
import multiprocessing
from functools import partial
from tqdm import tqdm
import argparse
# Source and destination paths.
# SRC_DIR is scanned for sub-folders; each sub-folder is mirrored under
# DEST_DIR with the same name.
SRC_DIR = "/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json"
DEST_DIR = "/ssd/xiedong/vlm-r1-train-tasks-json-ui-docto/tasks_json_small_size"
# Target length, in pixels, of an image's longest side after resizing.
MAX_SIZE = 1024
def resize_image(img_path, dest_path):
    """Write a copy of an image whose longest side is at most MAX_SIZE pixels.

    Images whose longest side exceeds MAX_SIZE are scaled down, keeping the
    aspect ratio, so that the longest side equals MAX_SIZE. Images that are
    already within the limit are copied unchanged rather than being enlarged
    and re-encoded.

    Args:
        img_path: Path of the source image.
        dest_path: Path the output image is written to.

    Returns:
        True on success; False if opening, resizing, saving or copying
        failed (the error is printed).
    """
    try:
        with Image.open(img_path) as img:
            width, height = img.size
            needs_resize = max(width, height) > MAX_SIZE

            if needs_resize:
                # Pin the longest side to MAX_SIZE and scale the other side.
                if width > height:
                    new_width = MAX_SIZE
                    new_height = int(height * (MAX_SIZE / width))
                else:
                    new_height = MAX_SIZE
                    new_width = int(width * (MAX_SIZE / height))

                # int() truncation can produce 0 for extreme aspect ratios,
                # which is not a valid image dimension.
                new_size = (max(1, new_width), max(1, new_height))

                resized_img = img.resize(new_size, Image.LANCZOS)
                resized_img.save(dest_path, quality=95)

        if not needs_resize:
            # Opening the file above still validated that it is an image;
            # copying avoids upscaling and a lossy re-encode.
            shutil.copy2(img_path, dest_path)
        return True
    except Exception as e:
        print(f"Error processing {img_path}: {e}")
        return False
def process_folder(folder_name):
    """Mirror one sub-folder of SRC_DIR into DEST_DIR.

    Files ending in .jpg/.jpeg/.png (case-insensitive) are passed to
    resize_image; every other entry is copied with shutil.copy2. A failure
    on one entry is reported and does not stop the remaining entries from
    being processed.

    Args:
        folder_name: Name of the sub-folder, relative to SRC_DIR.

    Returns:
        True if every entry was processed successfully, False if the folder
        could not be read/created or any entry failed.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    try:
        src_folder = os.path.join(SRC_DIR, folder_name)
        dest_folder = os.path.join(DEST_DIR, folder_name)

        # Create destination folder if it doesn't exist
        os.makedirs(dest_folder, exist_ok=True)

        all_ok = True
        for filename in os.listdir(src_folder):
            src_file = os.path.join(src_folder, filename)
            dest_file = os.path.join(dest_folder, filename)

            if filename.lower().endswith(image_suffixes):
                # resize_image prints its own error; only record the outcome.
                if not resize_image(src_file, dest_file):
                    all_ok = False
            else:
                # Copy non-image files (like JSON). Entries that cannot be
                # copied (e.g. a nested directory) are reported and skipped
                # so the rest of the folder is still processed.
                try:
                    shutil.copy2(src_file, dest_file)
                except OSError as e:
                    print(f"Error copying {src_file}: {e}")
                    all_ok = False

        return all_ok
    except Exception as e:
        print(f"Error processing folder {folder_name}: {e}")
        return False
def main():
    """Resize/copy every sub-folder of SRC_DIR into DEST_DIR in parallel.

    Each immediate sub-folder of SRC_DIR is handed to process_folder in a
    multiprocessing pool, with a tqdm progress bar. Folders for which
    process_folder returned False are listed once processing has finished.
    """
    # Create destination directory if it doesn't exist
    os.makedirs(DEST_DIR, exist_ok=True)

    # Only immediate sub-directories are processed; loose files are ignored.
    folders = [d for d in os.listdir(SRC_DIR) if os.path.isdir(os.path.join(SRC_DIR, d))]

    num_processes = max(1, multiprocessing.cpu_count() - 1)  # Leave one CPU free
    print(f"Processing {len(folders)} folders using {num_processes} processes")

    # imap yields results in input order, so they line up with `folders`.
    with multiprocessing.Pool(processes=num_processes) as pool:
        results = list(tqdm(pool.imap(process_folder, folders), total=len(folders)))

    failed = [name for name, ok in zip(folders, results) if not ok]
    if failed:
        print(f"{len(failed)} of {len(folders)} folders had errors: {', '.join(failed)}")

    print(f"Processing complete. Resized images and copied files to {DEST_DIR}")


if __name__ == "__main__":
    main()


本文作者:Dong
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC 许可协议。本作品采用《知识共享署名-非商业性使用 4.0 国际许可协议》进行许可。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接。转载请注明出处!