widerface数据集下载及转voc2007格式深度学习

widerface数据集转为voc2007格式（转）

- widerface数据集下载
- 转为voc2007格式
- 百度云下载

参考博客：转自于该博客
自己又重新整理了一下
widerface数据集下载
下载这四个文件

文章图片

转为voc2007格式

# Convert the WIDER FACE dataset into a PASCAL VOC2007-style directory layout
# (Annotations/, ImageSets/Main/, JPEGImages/).

import os
import random
import shutil
import string  # not used by this script; kept from the original import list

# NOTE(review): the XML tags of the three templates below were missing from the
# source text. They are reconstructed here as the usual VOC2007 annotation
# layout, matching the surviving field values and the format arguments used by
# excute_datasets() and writexml() -- confirm against the original script.

# Annotation header. Placeholders: image index, width, height, depth.
headstr = """\
<annotation>
    <folder>VOC2007</folder>
    <filename>%06d.jpg</filename>
    <source>
        <database>My Database</database>
        <annotation>PASCAL VOC2007</annotation>
        <image>flickr</image>
        <flickrid>NULL</flickrid>
    </source>
    <owner>
        <flickrid>NULL</flickrid>
        <name>company</name>
    </owner>
    <size>
        <width>%d</width>
        <height>%d</height>
        <depth>%d</depth>
    </size>
    <segmented>0</segmented>
"""

# One object entry. Placeholders: class name, xmin, ymin, xmax, ymax.
objstr = """\
    <object>
        <name>%s</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>%d</xmin>
            <ymin>%d</ymin>
            <xmax>%d</xmax>
            <ymax>%d</ymax>
        </bndbox>
    </object>
"""

# Closes the root element opened by headstr.
tailstr = '''\
</annotation>
'''

# NOTE: change this to the folder that contains WIDER_train/, WIDER_val/ and
# wider_face_split/. The VOC output folders are created inside it as well.
DATA_ROOT = 'XXXX/XXXXX'


def all_path(filename):
    """Return *filename* joined onto the dataset root folder."""
    return os.path.join(DATA_ROOT, filename)


def writexml(idx, head, bbxes, tail):
    """Write the annotation file ``Annotations/<idx>.xml``.

    Args:
        idx: Image index, used for the zero-padded six-digit file name.
        head: Already formatted annotation header.
        bbxes: Boxes whose first four items are ``x, y, w, h``.
        tail: Closing part of the annotation.
    """
    filename = all_path("Annotations/%06d.xml" % idx)
    with open(filename, "w") as f:
        f.write(head)
        for bbx in bbxes:
            x, y, w, h = bbx[:4]
            # The source stores width/height; VOC wants the opposite corner.
            f.write(objstr % ('face', x, y, x + w, y + h))
        f.write(tail)


def clear_dir():
    """Delete any previous output folders and recreate them empty."""
    for name in ('Annotations', 'ImageSets', 'JPEGImages'):
        path = all_path(name)
        if os.path.exists(path):
            shutil.rmtree(path)
    os.mkdir(all_path('Annotations'))
    os.makedirs(all_path('ImageSets/Main'))
    os.mkdir(all_path('JPEGImages'))


def _is_box_line(line):
    """Return True when *line* consists only of integer fields (a box row)."""
    fields = line.split()
    return len(fields) >= 4 and all(
        tok.lstrip('-').isdigit() for tok in fields
    )


def parse_bbx_gt(lines):
    """Yield ``(image_path, boxes)`` for each record of a ``*_bbx_gt.txt`` file.

    A record is an image path line, a face-count line, and then one line per
    face with the integer fields
    ``x1, y1, w, h, blur, expression, illumination, invalid, occlusion, pose``.
    Only boxes whose ``invalid`` field (index 7) is 0 are returned.

    A record with a face count of 0 may still be followed by one all-integer
    placeholder row; that row is skipped instead of being mistaken for the
    next image path.

    Args:
        lines: List of text lines of the annotation file.

    Raises:
        ValueError: If a record is truncated or a box row has too few fields.
    """
    pos = 0
    total = len(lines)
    while pos < total:
        filename = lines[pos].strip()
        pos += 1
        if not filename:
            # Same as the original reader: stop at the first empty line.
            break
        if pos >= total:
            raise ValueError('missing face count after %r' % filename)
        count = int(lines[pos].strip())
        pos += 1
        if count == 0 and pos < total and _is_box_line(lines[pos]):
            pos += 1  # placeholder row of an image without faces
        if pos + count > total:
            raise ValueError('truncated box list for %r' % filename)

        boxes = []
        for row in lines[pos:pos + count]:
            box = [int(tok) for tok in row.split()]
            if len(box) < 8:
                raise ValueError(
                    'malformed box row for %r: %r' % (filename, row)
                )
            if box[7] == 0:
                boxes.append(box)
        pos += count
        yield filename, boxes


def excute_datasets(idx, datatype):
    """Convert one split ('train' or 'val') starting at image index *idx*.

    For every image listed in ``wider_face_split/wider_face_<datatype>_bbx_gt.txt``
    this writes the XML annotation, copies the image to
    ``JPEGImages/<idx>.jpg`` and appends the index to
    ``ImageSets/Main/<datatype>.txt``.

    Args:
        idx: Index assigned to the first image of this split.
        datatype: Split name used in the source folder and file names.

    Returns:
        The next unused image index.
    """
    # Imported here so the pure helpers of this script stay importable on
    # machines without scikit-image.
    from skimage import io

    gt_path = all_path('wider_face_split/wider_face_' + datatype + '_bbx_gt.txt')
    with open(gt_path, 'r') as f_bbx:
        lines = f_bbx.readlines()

    with open(all_path('ImageSets/Main/' + datatype + '.txt'), 'a') as f:
        for filename, bbxes in parse_bbx_gt(lines):
            src = all_path('WIDER_' + datatype + '/images/' + filename)
            im = io.imread(src)
            # A grayscale image is loaded as a 2-D array without a channel axis.
            depth = im.shape[2] if im.ndim == 3 else 1
            head = headstr % (idx, im.shape[1], im.shape[0], depth)
            writexml(idx, head, bbxes, tailstr)
            shutil.copyfile(src, all_path('JPEGImages/%06d.jpg' % idx))
            f.write('%06d\n' % idx)
            idx += 1
    return idx


# Shuffle the samples (not called by the main routine below).
def shuffle_file(filename):
    """Shuffle the lines of *filename* in place."""
    with open(filename, 'r+') as f:
        lines = f.readlines()
        random.shuffle(lines)
        f.seek(0)
        f.truncate()
        f.writelines(lines)


if __name__ == '__main__':
    clear_dir()
    idx = 1
    idx = excute_datasets(idx, 'train')
    idx = excute_datasets(idx, 'val')

注意：下载的原始数据标注会有几个出现问题
如下：

文章图片

所以需要在 wider_face_split 文件夹下的 wider_face_train_bbx_gt.txt 中找到对应的错误标注并处理（这些图片的标注是空的，即人脸数为 0），如下：

文章图片

下面列出来自己找的错误的标注：
10422行：0_Parade_Parade_0_452.jpg
86537行：2_Demonstration_Political_Rally_2_444.jpg
133392行：39_Ice_Skating_iceskiing_39_380.jpg
145712行：46_Jockey_Jockey_46_576.jpg
百度云下载 【widerface数据集下载及转voc2007格式】链接
提取码：28g6