Python 目标检测数据清洗与标注常用程序记录

1、批量修改一个文件夹中图片的文件名

python
展开代码
import os
# Rename every .jpg in `father` to <prefix><5-digit number>.jpg, numbering the
# images consecutively from the start number typed by the user.
prefix = input("前缀\n")
startNumber = input("开始的数字\n")
father = r"D:\Desktop\jinggai\jinggai3"

# Pick the images first so non-image entries do not consume numbers, and sort
# them because os.listdir() returns names in arbitrary order.
jpgNames = sorted(name for name in os.listdir(father) if name.lower().endswith(".jpg"))

# Pass 1: park every image under a temporary name. Renaming straight to the
# final name could hit a file that has not been processed yet (e.g. re-running
# with the same prefix), which overwrites it on POSIX and raises on Windows.
staged = []
for k, fileName in enumerate(jpgNames, int(startNumber)):
    tempName = "__renaming_" + str(k).zfill(5) + "_" + fileName
    os.rename(os.path.join(father, fileName), os.path.join(father, tempName))
    staged.append((tempName, prefix + str(k).zfill(5) + ".jpg"))

# Pass 2: all original names are free now, so the final names cannot collide.
for tempName, finalName in staged:
    os.rename(os.path.join(father, tempName), os.path.join(father, finalName))

2、已经为一部分图片做了标注，利用标注文件清除图片文件夹中没有对应labels的图片

python
展开代码
import os
# Delete every entry in `images` that is not a .jpg with a same-named .txt in
# `labels`, then report how many images and label files remain.
images = r"D:\Desktop\jinggai\images"
labels = r"D:\Desktop\jinggai\labels"
imgFileNameList = os.listdir(images)
labelFileNameList = os.listdir(labels)

# Set lookup instead of scanning the label list once per image.
labelNameSet = set(labelFileNameList)
for fileName in imgFileNameList:
    hasLabel = fileName.endswith(".jpg") and fileName[:-4] + ".txt" in labelNameSet
    if not hasLabel:
        # Non-jpg entries are removed as well, as in the original script.
        os.remove(os.path.join(images, fileName))

# Re-read the image folder: the listing taken above predates the deletions and
# would report a stale count.
imgFileNameList = os.listdir(images)
print("当前图片剩余多少张==>", len(imgFileNameList))
print("当前图片标注剩余多少个文件==>", len(labelFileNameList), labelFileNameList)

3、使用算法服务标注数据，将label结果写入到一个文件夹中

python
展开代码
# -*- coding: utf-8 -*-
import os
import requests
import cv2
# Folder whose files are uploaded to the detection service.
imgpath = r"D:\Desktop\jinggai\jinggai"
# Folder that receives the generated .txt label files.
labelspath = r"D:\Desktop\jinggai\labels"
class DataConvert():
    """Convert bounding boxes between pixel corners and normalised centre form.

    Box notations used in these notes:

    * ``x1, y1, x2, y2`` - (x1, y1) is the top-left corner and (x2, y2) the
      bottom-right corner.
    * ``x1, y1, w, h`` - (x1, y1) is the top-left corner, w and h are the box
      width and height.
    * ``xc, yc, w, h`` - (xc, yc) is the box centre, w and h are the box width
      and height (called "COCO annotation" in these notes).

    ``imgShape`` is ``(height, width)`` optionally followed by a channel
    count; only the first two entries are used.
    """

    def __init__(self):
        pass

    @staticmethod
    def cvtx0y0whTox1y1x2y2(x0, y0, w, h, imgShape):
        """Return pixel corners ``(x1, y1, x2, y2)`` for a normalised box.

        ``x0, y0`` is the normalised centre and ``w, h`` the normalised size.
        Example from the original notes:
        "0.530921 0.666667 0.622368 0.666667" => (167, 169, 639, 507).
        Results are truncated to ints with ``int()``.
        """
        # Slice instead of unpacking three values so 2-D (grayscale) shapes work.
        height, width = imgShape[:2]
        x1 = int((x0 - w * 0.5) * width)
        y1 = int((y0 - h * 0.5) * height)
        x2 = int((x0 + w * 0.5) * width)
        y2 = int((y0 + h * 0.5) * height)
        return x1, y1, x2, y2

    @staticmethod
    def cvtx1y1x2y2Tox0y0wh(x1, y1, x2, y2, imgShape):
        """Return the normalised ``(x0, y0, w, h)`` for pixel corners.

        ``(x1, y1)`` is the top-left and ``(x2, y2)`` the bottom-right corner,
        both in pixels. Example from the original notes:
        (167, 169, 639, 507) => "0.530921 0.666667 0.622368 0.666667".
        """
        # Slice instead of unpacking three values so 2-D (grayscale) shapes work.
        height, width = imgShape[:2]
        x0 = (x1 + x2) / 2 / width
        y0 = (y1 + y2) / 2 / height
        w = (x2 - x1) / width
        h = (y2 - y1) / height
        return x0, y0, w, h
# Upload every file in imgpath to the detection service and append one label
# line per returned box to <labelspath>/<image name without its last 4 chars>.txt.
files = os.listdir(imgpath)
for file in files:
    filep = os.path.join(imgpath, file)
    # Close the uploaded file as soon as the request finishes; the original
    # left one open handle behind per image.
    with open(filep, "rb") as imgFile:
        res = requests.post(url="http://172.18.43.32:8012/alg_analyse_bytes/",
                            files={"file": imgFile}).json()
    labelfile = os.path.join(labelspath, file[:-4] + ".txt")
    if len(res["data"]) != 0:
        # The image is only read to obtain its shape for normalisation.
        img = cv2.imread(filep)
        shape = img.shape
        # Append mode is kept, so re-running the script adds lines to existing
        # label files rather than replacing them.
        with open(labelfile, "a") as labelOut:
            for data in res["data"]:
                # NOTE(review): assumes data["bbox"] is x1, y1, x2, y2 in pixels,
                # which is what cvtx1y1x2y2Tox0y0wh expects - confirm with the service.
                bboxs0 = list(map(int, data["bbox"]))
                poss = DataConvert.cvtx1y1x2y2Tox0y0wh(*bboxs0, shape)
                poss = list(map(lambda x: str(round(x, 6)), poss))
                # Every box is written with class id 0. "\n" is used because
                # text mode already translates it to the platform line ending;
                # writing "\r\n" produced "\r\r\n" on Windows.
                labelOut.write("0 " + " ".join(poss) + "\n")

4、每张图都必有labels的前提下，让图片文件夹文件数量和label文件夹文件数量一致。

python
展开代码
import os
# Make the image folder and the label folder hold matching sets of files:
# every remaining .jpg has a non-blank .txt and every remaining .txt has a .jpg.
images = r"D:\Desktop\jinggai\jinggai_nano\images"
labels = r"D:\Desktop\jinggai\jinggai_nano\labels"

# Pass 1: drop entries in the label folder that are not .txt or are blank.
# Doing this first means images whose label turns out to be blank are removed
# in pass 2; previously they were left behind and the counts still differed.
for fileName in os.listdir(labels):
    labelPath = os.path.join(labels, fileName)
    removeLabel = not fileName.endswith(".txt")
    if not removeLabel:
        with open(labelPath, "r") as labelFile:
            # Original blank-file heuristic, kept as is: str() of the line list
            # is shorter than 7 characters for an empty file ("[]") or a file
            # holding a single newline ("['\\n']").
            removeLabel = len(str(labelFile.readlines())) < 7
    if removeLabel:
        os.remove(labelPath)

# Pass 2: drop everything in the image folder that is not a .jpg with a label.
labelNameSet = set(os.listdir(labels))
for fileName in os.listdir(images):
    if not (fileName.endswith(".jpg") and fileName[:-4] + ".txt" in labelNameSet):
        os.remove(os.path.join(images, fileName))

# Pass 3: drop labels whose image does not exist.
imgNameSet = set(os.listdir(images))
for fileName in os.listdir(labels):
    if fileName[:-4] + ".jpg" not in imgNameSet:
        os.remove(os.path.join(labels, fileName))

imgFileNameList = os.listdir(images)
labelFileNameList = os.listdir(labels)
print("当前图片剩余多少张==>", len(imgFileNameList))
print("当前图片标注剩余多少个文件==>", len(labelFileNameList), labelFileNameList)

5、有图片和对应的xml标注：

将图片全部改名，重写到另外一个文件夹；

将对应的xml标注文件改名；

python
展开代码
import os
import traceback
import cv2
import numpy as np
# Re-encode every image in `father` as <prefix><5-digit number>.jpg in the
# tarimg folder and rename the matching XML annotation to the same stem.
prefix = input("前缀\n")
startNumber = input("开始的数字\n")
father = r"D:\fireandsmoke\images"
father_labels = r"D:\fireandsmoke\annotations"
for k, fileName in enumerate(os.listdir(father), int(startNumber)):
    newStem = prefix + str(k).zfill(5)
    try:
        # np.fromfile + imdecode reads the bytes through Python's own file
        # handling instead of OpenCV's path handling.
        img = cv2.imdecode(np.fromfile(os.path.join(father, fileName), dtype=np.uint8), 1)  # img is a matrix
        # Write through imencode + tofile for the same reason. Unlike imwrite,
        # a failure here raises, so the XML is never renamed for an image
        # that was not written.
        encodedOk, encoded = cv2.imencode(".jpg", img)
        if not encodedOk:
            raise ValueError("could not encode image as JPEG: " + fileName)
        encoded.tofile(os.path.join(r"D:\fireandsmoke\tarimg", newStem + ".jpg"))
        # splitext keeps dots inside the name ("a.b.jpg" -> "a.b"), whereas
        # split(".")[0] truncated it to "a".
        os.rename(os.path.join(father_labels, os.path.splitext(fileName)[0] + ".xml"),
                  os.path.join(father_labels, newStem + ".xml"))
    except Exception:
        # Report and carry on with the next file. A bare `except:` would also
        # swallow KeyboardInterrupt and make the batch impossible to stop.
        traceback.print_exc()

6、xml标记文件转txt标注文件（每行为“类别编号 中心点x 中心点y 宽 高”，坐标已归一化；这种格式通常称为YOLO格式，文中称为COCO的txt）。

python
展开代码
import xml.etree.ElementTree as ET
import os
# Folder holding the XML annotation files to convert.
xmldir = r"D:\fireandsmoke\annotations"
# Folder that receives the generated .txt label files.
txtdir = r"D:\fireandsmoke\labels"
# Class names to keep; a name's index in this list is the class id written to the txt.
classes = ['fire']  # label names
def convert_annotation(img_id_filename, xml_dir=None, txt_dir=None, class_names=None):
    """Convert one XML annotation file into a txt label file.

    Each kept object becomes one line ``<class id> <xc> <yc> <w> <h>`` with
    the four box values divided by the image size and rounded to 6 decimals.

    Args:
        img_id_filename: Name of the XML file; its last four characters
            (the ".xml" suffix) are stripped to get the image id.
        xml_dir: Folder containing the XML file. Defaults to the module-level
            ``xmldir``.
        txt_dir: Folder that receives ``<image id>.txt``. Defaults to the
            module-level ``txtdir``.
        class_names: Class names to keep; the index in this list is the class
            id. Defaults to the module-level ``classes``.

    Objects whose class is not in ``class_names`` or whose ``difficult`` flag
    is 1 are skipped; a missing or empty ``difficult`` tag counts as 0.
    If the recorded image width or height is 0 and a kept object is found,
    the XML file is deleted and no txt file is written.
    """
    xml_dir = xmldir if xml_dir is None else xml_dir
    txt_dir = txtdir if txt_dir is None else txt_dir
    class_names = classes if class_names is None else class_names

    image_id = img_id_filename[:-4]  # file name without the suffix
    xml_path = os.path.join(xml_dir, '%s.xml' % image_id)
    # Parse inside `with` so the handle is closed even if parsing fails, and
    # before the file may have to be deleted below.
    with open(xml_path, encoding='UTF-8') as in_file:
        root = ET.parse(in_file).getroot()

    size = root.find('size')
    size_width = int(size.find('width').text)  # image width
    size_height = int(size.find('height').text)  # image height

    label_lines = []
    for obj in root.iter('object'):  # every annotated object
        # Not every annotation tool writes <difficult>; treat absence as "not difficult".
        difficult = obj.findtext('difficult') or '0'
        cls = obj.find('name').text  # class label
        if cls not in class_names or int(difficult) == 1:
            continue
        if size_width == 0 or size_height == 0:
            print("不合理的图，程序会删除这张图", image_id)
            os.remove(xml_path)
            # Stop here: continuing the loop would try to delete the file
            # again for the next object and raise FileNotFoundError.
            return
        cls_id = class_names.index(cls)  # first number on each output line
        xmlbox = obj.find('bndbox')
        # Note the order: xmin, xmax, ymin, ymax.
        xmin, xmax, ymin, ymax = (float(xmlbox.find(tag).text)
                                  for tag in ('xmin', 'xmax', 'ymin', 'ymax'))
        # Clamp coordinates that run past the right/bottom image edge.
        xmin, xmax = min(xmin, size_width), min(xmax, size_width)
        ymin, ymax = min(ymin, size_height), min(ymax, size_height)
        # The "- 1" on the centre is kept from the original formula.
        # NOTE(review): presumably it compensates for 1-based pixel coordinates - confirm.
        txt_data = [((xmin + xmax) / 2.0 - 1) / size_width,
                    ((ymin + ymax) / 2.0 - 1) / size_height,
                    (xmax - xmin) / size_width,
                    (ymax - ymin) / size_height]
        txt_data = [round(value, 6) for value in txt_data]
        label_lines.append(str(cls_id) + " " + " ".join(str(a) for a in txt_data) + '\n')

    # Written last so a failed or rejected annotation leaves no half-written file.
    with open(os.path.join(txt_dir, '%s.txt' % image_id), 'w') as out_file:
        out_file.writelines(label_lines)
def getClsName(xml_dir=None):
    """Print and return the set of class names used in a folder of XML files.

    Args:
        xml_dir: Folder to scan. Defaults to the module-level ``xmldir``.

    Returns:
        The set of ``<name>`` texts found under every ``<object>`` element.
        Entries whose name does not end in ".xml" are ignored.
    """
    xml_dir = xmldir if xml_dir is None else xml_dir
    clsSet = set()
    for img_id_filename in os.listdir(xml_dir):
        # Skip other files; the original stripped four characters and re-added
        # ".xml", which pointed at a non-existent file for any other suffix.
        if not img_id_filename.lower().endswith('.xml'):
            continue
        # `with` closes the handle even when the XML is malformed.
        with open(os.path.join(xml_dir, img_id_filename), encoding='UTF-8') as in_file:
            root = ET.parse(in_file).getroot()
        for obj in root.iter('object'):  # every annotated object
            clsSet.add(obj.find('name').text)  # class label
    print(clsSet)
    return clsSet
# imglist = os.listdir(imgdir)
# xmllist = os.listdir(xmldir)
# # 测试图片是否损坏
# for img_id in imglist:
#     try:
#         cv2img = cv2.imread(os.path.join(imgdir, img_id))
#     except:
#         os.remove(os.path.join(imgdir, img_id))
if __name__ == '__main__':
    # Convert every XML annotation in xmldir. Other entries (for example a
    # classes.txt or a system file) are skipped; previously they made
    # convert_annotation look for a non-existent .xml and abort the batch.
    xmllist = os.listdir(xmldir)
    for img_id in xmllist:
        if img_id.lower().endswith('.xml'):
            convert_annotation(img_id)