1.想做一个图像分类任务,到底分析与实战流程是什么样的? 2.逻辑回归简单demo
1.数据采集 现有数据集 1 2 3 4 这是一个嘴唇数据集,包括无状态的嘴唇和微笑状态的嘴唇,为本项目的caffe和tensorflow例子使用。 所有图片的尺寸为60*60 0 无状态 1 微笑
爬取 利用 Image-Downloader 开源项目获取数据集
环境:python selenium && chromedriver
爬取命令:
1 python image_downloader.py --engine Bing "嘟嘴"
# 2. Data processing — check image sizes.
# Usage: python checksize.py <image_dir>
# checksize.py contents:
for image in os.listdir(sys.argv[1]):
    img = cv2.imread(os.path.join(sys.argv[1], image))
    if img is None:
        # cv2.imread returns None for unreadable/corrupt files; the original
        # crashed on `img.shape` here.
        print(image, " could not be read")
        continue
    print(image, " shape is ", img.shape)
# Re-encode every image as JPEG (and optionally rename sequentially).
# Usage: python reformat_images.py <image_dir>
# reformat_images.py contents — body of the per-file loop; `filepath`, `root`,
# `fileid`, `rename` and `num` come from the surrounding walk loop (not shown).
try:
    src = cv2.imread(filepath, 1)
    if src is None:
        # Unreadable image: treat as corrupt and drop it (the original relied
        # on `src.shape` raising AttributeError to reach the except branch).
        os.remove(filepath)
    else:
        print("src=", filepath, src.shape)
        os.remove(filepath)  # delete the original file
        if rename:
            # Write the re-encoded image under a fresh sequential name.
            cv2.imwrite(os.path.join(root, str(num) + ".jpg"), src)
            num = num + 1
        else:
            # Write the re-encoded image, keeping the original file id.
            cv2.imwrite(os.path.join(root, fileid + ".jpg"), src)
except Exception:
    # Narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit.
    os.remove(filepath)  # remove broken image
# Batch-resize images to a uniform size.
from PIL import Image
import os


def image_resize(image_path, new_path):
    """Resize every file under image_path to 60x60 and save it into new_path.

    :param image_path: directory containing the source images
    :param new_path: directory the resized copies are written to (created if missing)
    """
    print('============>>修改图片尺寸')
    # The original crashed with FileNotFoundError when the target dir was missing.
    os.makedirs(new_path, exist_ok=True)
    for img_name in os.listdir(image_path):
        # os.path.join instead of manual "/" concatenation (portable).
        img_path = os.path.join(image_path, img_name)
        image = Image.open(img_path)
        image = image.resize((60, 60))  # target training size
        # process the 1 channel image
        image.save(os.path.join(new_path, img_name))
    print("end the processing!")


if __name__ == '__main__':
    print("ready for :::::::: ")
    ori_path = "./data/test"
    new_path = "./data/testtmp"
    image_resize(ori_path, new_path)
# Produce train/val list files from a single list file.
# Usage: python split_train_val.py <listfile>
# Outputs: <stem>_train.txt and <stem>_val.txt next to the input, e.g.
#          mouth0.txt -> mouth0_train.txt / mouth0_val.txt
import os
import sys


def splittrain_val(fileall, valratio=0.1):
    """Split the lines of `fileall` into <stem>_train.txt and <stem>_val.txt.

    :param fileall: path of the input list file
    :param valratio: fraction of lines routed to the val file — every Nth line
        with N = int(1/valratio); out-of-range values fall back to 0.1
    """
    # os.path.splitext keeps the full stem; the original `split('.')[-2]`
    # silently dropped everything before an extra dot in the path.
    fileid = os.path.splitext(fileall)[0]
    if valratio <= 0 or valratio >= 1:  # original only checked == 0 / >= 1
        valratio = 0.1
    interval = int(1.0 / valratio)
    count = 0
    # `with` closes all three files; the original leaked the handles.
    with open(fileall) as f, \
            open(fileid + "_train.txt", 'w') as ftrain, \
            open(fileid + "_val.txt", 'w') as fval:
        for line in f:
            count += 1
            if count % interval == 0:
                fval.write(line)
            else:
                ftrain.write(line)


if __name__ == '__main__':
    splittrain_val(sys.argv[1], 0.1)
# Randomly shuffle the lines of a text file.
# Usage: python shuffle_txt.py <infile> <outfile>
import random
import sys


def shuffle(file_in, file_out):
    """Write the lines of file_in to file_out in random order.

    :param file_in: input text file path
    :param file_out: output text file path (overwritten)
    """
    # `with` closes both files; the original leaked the handles.
    with open(file_in, 'r') as fin:
        lines = fin.readlines()
    random.shuffle(lines)  # in-place shuffle
    with open(file_out, 'w') as fout:
        fout.writelines(lines)


if __name__ == '__main__':
    shuffle(sys.argv[1], sys.argv[2])
去重
class simpleconv3(nn.Module):
    """Three-conv-layer baseline classifier for 3x48x48 inputs.

    Trunk: three stride-2 conv+BN+ReLU stages (3->12->24->48 channels,
    48x48 -> 23x23 -> 11x11 -> 5x5 feature maps, 3x3 kernels), followed by
    three fully-connected layers ending in `nclass` logits.
    """

    def __init__(self, nclass):
        super(simpleconv3, self).__init__()
        # Convolutional trunk: 3x3 kernel, stride 2 at every stage.
        self.conv1 = nn.Conv2d(3, 12, 3, 2)    # 3x48x48  -> 12x23x23
        self.bn1 = nn.BatchNorm2d(12)
        self.conv2 = nn.Conv2d(12, 24, 3, 2)   # 12x23x23 -> 24x11x11
        self.bn2 = nn.BatchNorm2d(24)
        self.conv3 = nn.Conv2d(24, 48, 3, 2)   # 24x11x11 -> 48x5x5
        self.bn3 = nn.BatchNorm2d(48)
        # Classifier head: 48*5*5 = 1200 -> 1200 -> 128 -> nclass.
        self.fc1 = nn.Linear(48 * 5 * 5, 1200)
        self.fc2 = nn.Linear(1200, 128)
        self.fc3 = nn.Linear(128, nclass)

    def forward(self, x):
        # Shared trunk: conv -> batch-norm -> ReLU, three times.
        for conv, bn in ((self.conv1, self.bn1),
                         (self.conv2, self.bn2),
                         (self.conv3, self.bn3)):
            x = F.relu(bn(conv(x)))
        # Flatten the 48x5x5 feature map into a 1200-d vector.
        x = x.view(-1, 48 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
## Dataset reader setup.
image_size = 60   ## size every image is rescaled to
crop_size = 48    ## crop size, i.e. the training input size
nclass = 2        ## number of classes
model = simpleconv3(nclass)  ## build the model
data_dir = './data'          ## data root; expects ./data/train and ./data/val

## Read the data with torchvision's ImageFolder dataset interface.
## NOTE(review): `data_transforms` (a per-phase transform dict keyed by
## 'train'/'val') must be defined before this snippet — it is not shown here.
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

## Data loaders: batch size 16, shuffled, 4 worker processes.
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x],
                                              batch_size=16,
                                              shuffle=True,
                                              num_workers=4)
               for x in ['train', 'val']}

## Number of samples in each phase (used for per-epoch accuracy).
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
## Training optimizer setup: cross-entropy loss, SGD with momentum, and a
## step LR schedule that multiplies the learning rate by 0.1 every 100 epochs.
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=100, gamma=0.1)
## Training main function.
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train `model`, alternating a train and a val phase each epoch.

    :param model: the network to train
    :param criterion: loss function
    :param optimizer: parameter optimizer
    :param scheduler: per-epoch learning-rate scheduler
    :param num_epochs: number of epochs to run
    :return: the trained model

    Relies on module-level `dataloaders`, `dataset_sizes`, `use_gpu` and the
    tensorboard `writer` (defined elsewhere in the script).
    """
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train(True)   ## training mode (BN/dropout active)
            else:
                model.train(False)  ## evaluation mode

            running_loss = 0.0  ## accumulated loss
            running_accs = 0.0  ## accumulated correct-prediction count
            number_batch = 0

            ## Iterate over the batches of this phase.
            for data in dataloaders[phase]:
                inputs, labels = data
                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                optimizer.zero_grad()                   ## clear gradients
                outputs = model(inputs)                 ## forward pass
                _, preds = torch.max(outputs.data, 1)   ## predicted class index
                loss = criterion(outputs, labels)       ## compute loss
                if phase == 'train':
                    loss.backward()   ## backpropagate
                    optimizer.step()  ## update parameters

                running_loss += loss.data.item()
                running_accs += torch.sum(preds == labels).item()
                number_batch += 1

            ## Since PyTorch 1.1 the LR scheduler must step AFTER the epoch's
            ## optimizer updates; the original stepped before them, skipping
            ## the initial learning rate.
            if phase == 'train':
                scheduler.step()

            ## Average loss and accuracy for this epoch.
            epoch_loss = running_loss / number_batch
            epoch_acc = running_accs / dataset_sizes[phase]

            ## Collect loss/accuracy for visualization.
            if phase == 'train':
                writer.add_scalar('data/trainloss', epoch_loss, epoch)
                writer.add_scalar('data/trainacc', epoch_acc, epoch)
            else:
                writer.add_scalar('data/valloss', epoch_loss, epoch)
                writer.add_scalar('data/valacc', epoch_acc, epoch)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

    writer.close()
    return model


## Run training (the blog showed this call above the definition; it must
## come after the def in an actual script).
model = train_model(model=model,
                    criterion=criterion,
                    optimizer=optimizer_ft,
                    scheduler=exp_lr_scheduler,
                    num_epochs=300)
## Save the trained weights (state_dict only, not the whole module).
import os
os.makedirs('models', exist_ok=True)  # original crashed if 'models/' was missing
torch.save(model.state_dict(), 'models/model.pt')
# 4. Model inference.
## Load the trained weights into the network (`net` is built elsewhere).
modelpath = sys.argv[1]
net.load_state_dict(torch.load(modelpath,
                               map_location=lambda storage, loc: storage))
## Inference mode: freeze BatchNorm statistics. The original skipped this,
## so BN used per-batch stats on a batch of one — wrong predictions.
net.eval()

## Preprocessing pipeline.
## NOTE(review): Resize(48) scales the SHORTER side to 48 (not a square
## resize); training used resize-to-60 + crop-48 — confirm this is intended.
data_transforms = transforms.Compose([
    transforms.Resize(48),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

## Read the image as 3-channel RGB and expand to a 4-D (1,C,H,W) tensor.
## convert('RGB') guards against grayscale/RGBA files, which would break the
## 3-channel Normalize above.
imagepath = sys.argv[2]
image = Image.open(imagepath).convert('RGB')
imgblob = data_transforms(image).unsqueeze(0)

## Run the network and take the arg-max class index as the label.
predict = net(imgblob)
index = np.argmax(predict.detach().numpy())

if index == 0:
    print('the predict of ' + sys.argv[2] + ' is ' + str('none'))
else:
    print('the predict of ' + sys.argv[2] + ' is ' + str('smile'))
# 5. Demo — imports (the blog paste ran these together on one line with no
# separators, which is not valid Python; split and grouped conventionally).
import sys

import cv2
import numpy as np
import torch
from PIL import Image, ImageFont, ImageDraw
from torchvision import datasets, models, transforms

from net import simpleconv3
def show_chinese(img, text, pos):
    """Render (Chinese) text onto an OpenCV image.

    :param img: OpenCV image (BGR numpy array)
    :param text: text to draw (may contain Chinese characters)
    :param pos: (x, y) position where the text is drawn
    :return: a BGR image with the text rendered
    """
    # OpenCV's putText cannot render CJK glyphs, so round-trip through PIL:
    # BGR -> RGB -> draw with a TrueType font -> back to BGR.
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    # NOTE(review): 'msyh.ttc' (Microsoft YaHei) is a Windows system font —
    # confirm it is available on the deployment machine.
    font = ImageFont.truetype(font='msyh.ttc', size=36)
    draw = ImageDraw.Draw(img_pil)
    draw.text(pos, text, font=font, fill=(255, 0, 0))  # red text
    img_cv = np.array(img_pil)
    img = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
    return img
## Build the network and load the trained weights.
net = simpleconv3(2)
modelpath = sys.argv[1]
## The training script saved net.state_dict() (an OrderedDict), so restore it
## with load_state_dict; the original re-bound `net = torch.load(...)`, which
## left `net` as a plain dict and would crash on the first forward call.
net.load_state_dict(torch.load(modelpath,
                               map_location=lambda storage, loc: storage))
## eval() AFTER the weights are loaded: BN/dropout in inference mode.
net.eval()
## A bare `torch.no_grad()` expression does nothing (the context manager is
## never entered); disable autograd globally for this inference-only script.
torch.set_grad_enabled(False)

data_transforms = transforms.Compose([
    transforms.Resize(48),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
## Read the demo video frame by frame, classify each frame, and overlay the
## predicted label on the displayed image. Press 'q' to quit.
cap = cv2.VideoCapture(r'demo1.mp4')
if not cap.isOpened():
    print("Error opening video file")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    (h, w) = frame.shape[:2]
    print(h, w)

    ## OpenCV frames are BGR; convert to RGB before handing to PIL so the
    ## channel order matches the RGB images the model was trained on
    ## (the original fed BGR straight in, swapping red and blue).
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    image = image.resize((60, 60))  ## same size the dataset images use
    imgblob = data_transforms(image).unsqueeze(0)
    predict = net(imgblob)
    index = np.argmax(predict.detach().numpy())

    res = ""
    if index == 0:
        print('the predict of ' + ' is ' + str('没有笑'))
        res = '没有笑'
    if index == 1:
        print('the predict of ' + ' is ' + str('微笑'))
        res = '微笑'
    ## NOTE(review): with simpleconv3(2) the argmax can only be 0 or 1, so
    ## this branch is unreachable unless a 3-class model ("嘟嘴"/pouting)
    ## is loaded instead.
    if index == 2:
        print('the predict of ' + ' is ' + str('嘟嘴'))
        res = '嘟嘴'

    frame = show_chinese(frame, res, (50, 50))
    cv2.rectangle(frame, (0, 0), (frame.shape[1], frame.shape[0]), (0, 0, 255))
    cv2.imshow('Frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
逻辑回归demo logistic回归简单来说和线性回归是一样的,要做的运算同样是 y = w * x + b,不同的是 logistic 回归做的是二分类问题:使用 sigmoid 函数将任意实数(无论正负)都映射到 0-1 之间的数,这样就可以用这个数来确定到底属于哪一类,可以简单地认为概率大于 0.5 即为第二类,小于 0.5 为第一类。
sigmoid 函数图像
我们这里要做的是多分类问题 ,对于每一个数据,我们输出的维数是分类的总数,比如10分类,我们输出的就是一个10维的向量,然后我们使用另外一个激活函数,softmax
这就是softmax函数作用的机制,其实简单的理解就是确定这10个数每个数对应的概率有多大,因为这10个数有正有负,
所以先通过指数函数将他们全部变成正数,然后求和,然后这10个数每个数都除以这个和,这样就得到了每个类别的概率。