import os
import shutil
import random
# Define dataset paths
source_images = r"C:\dataset_example\images"  # Folder containing all images
source_labels = r"C:\dataset_example\labels"  # Folder containing all labels
output_dir = r"C:\dataset_example\split"  # Folder where images and labels will be organized

# Split ratios (only "train" and "val" are read; "test" receives the remainder)
split_ratio = {"train": 0.7, "val": 0.15, "test": 0.15}

# Names of the sub-folders created under <output>/images and <output>/labels
SPLIT_NAMES = ("train", "val", "test")

# Image extensions picked up from the source folder (matched case-insensitively)
IMAGE_EXTENSIONS = (".jpg", ".png", ".jpeg")


def split_dataset(images_dir, labels_dir, dest_dir, ratios, seed=None):
    """Copy images and their label files into train/val/test sub-folders.

    Every regular file in *images_dir* whose extension is in
    ``IMAGE_EXTENSIONS`` (ignoring case) is shuffled and copied to
    ``<dest_dir>/images/<split>/``.  A label named ``<image stem>.txt`` in
    *labels_dir* is copied to ``<dest_dir>/labels/<split>/`` when it exists;
    images without a label are still copied and the label is simply skipped.

    Args:
        images_dir: Folder containing all images.
        labels_dir: Folder containing all ``.txt`` label files.
        dest_dir: Root folder that receives the ``images`` and ``labels`` trees.
        ratios: Mapping with ``"train"`` and ``"val"`` fractions.  The sizes
            are ``int(total * fraction)``; all remaining images go to "test".
        seed: Optional seed for a reproducible shuffle.  When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A dict mapping each split name to the number of images copied into it.

    Raises:
        FileNotFoundError: If *images_dir* does not exist (from ``os.listdir``).
    """
    # List the source first so a wrong path fails before any folder is created.
    # Sorting makes the pre-shuffle order deterministic (os.listdir order is
    # arbitrary), which is what lets a seed reproduce the same split.
    image_filenames = sorted(
        name
        for name in os.listdir(images_dir)
        if name.lower().endswith(IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(images_dir, name))
    )
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(image_filenames)

    # Ensure output folders exist
    for split in SPLIT_NAMES:
        os.makedirs(os.path.join(dest_dir, "images", split), exist_ok=True)
        os.makedirs(os.path.join(dest_dir, "labels", split), exist_ok=True)

    # Compute split sizes
    total = len(image_filenames)
    train_count = int(total * ratios["train"])
    val_count = int(total * ratios["val"])

    counts = {split: 0 for split in SPLIT_NAMES}

    # Assign each image to a split by its position in the shuffled list
    for i, img_name in enumerate(image_filenames):
        if i < train_count:
            split = "train"
        elif i < train_count + val_count:
            split = "val"
        else:
            split = "test"

        # Define source and destination paths
        label_name = os.path.splitext(img_name)[0] + ".txt"
        img_src = os.path.join(images_dir, img_name)
        lbl_src = os.path.join(labels_dir, label_name)
        img_dest = os.path.join(dest_dir, "images", split, img_name)
        lbl_dest = os.path.join(dest_dir, "labels", split, label_name)

        # Copy image
        shutil.copy(img_src, img_dest)
        counts[split] += 1

        # Copy label if it exists
        if os.path.isfile(lbl_src):
            shutil.copy(lbl_src, lbl_dest)

    return counts


def main():
    """Split the dataset configured at the top of this file and report the result."""
    counts = split_dataset(source_images, source_labels, output_dir, split_ratio)
    print(", ".join(f"{split}: {count}" for split, count in counts.items()))
    print("Dataset successfully split into train, val, and test sets.")


if __name__ == "__main__":
    main()