DL Coding Practice

1. For an image classification task, what does the end-to-end analysis and hands-on workflow look like?
2. A simple logistic regression demo

1. Data Collection

Existing dataset

This is a mouth dataset containing mouths in a neutral state and mouths in a smiling state, used by the Caffe and TensorFlow examples of this project.
All images are 60*60.
0: neutral
1: smile

Crawling

Use the open-source Image-Downloader project to collect the dataset.

Environment: Python + selenium + chromedriver

Crawl command:

python image_downloader.py --engine Bing "嘟嘴"   # the search keyword means "pout"

2. Data Processing

Check image sizes

python checksize.py F:\code\PyCharmProject\yousan.ai\computer_vision\datas\mouth\0

# checksize.py
import os
import sys
import cv2

# print the shape of every image in the directory passed as the first argument
for image in os.listdir(sys.argv[1]):
    img = cv2.imread(os.path.join(sys.argv[1], image))
    print(image, " shape is ", img.shape)

Reformat images

python reformat_images.py  F:\code\PyCharmProject\yousan.ai\computer_vision\datas\mouth\0

# reformat_images.py: re-encode every image as JPEG and drop files that fail to decode
import os
import sys
import cv2

rename = True  # assumed default; the original excerpt uses this flag without defining it
num = 0
for root, dirs, files in os.walk(sys.argv[1]):
    for filename in files:
        filepath = os.path.join(root, filename)
        fileid = os.path.splitext(filename)[0]
        try:
            src = cv2.imread(filepath, 1)
            print("src=", filepath, src.shape)
            os.remove(filepath)  # delete the original image
            if rename:
                cv2.imwrite(os.path.join(root, str(num) + ".jpg"), src)  # write the new image under a sequential name
                num = num + 1
            else:
                cv2.imwrite(os.path.join(root, fileid + ".jpg"), src)  # write the new image, keeping the name
        except:
            os.remove(filepath)  # remove corrupt images

Batch image resizing

from PIL import Image
import os


def image_resize(image_path, new_path):  # unify image sizes
    print('============>> resizing images')
    for img_name in os.listdir(image_path):
        img_path = image_path + "/" + img_name  # full path of this image
        image = Image.open(img_path)  # open one image
        image = image.resize((60, 60))  # target size for the conversion
        # process the 1 channel image
        image.save(new_path + '/' + img_name)
    print("end the processing!")


if __name__ == '__main__':
    print("ready for :::::::: ")
    ori_path = "./data/test"
    new_path = "./data/testtmp"
    image_resize(ori_path, new_path)

Generate train/val file lists

python split_train_val.py F:\code\PyCharmProject\yousan.ai\computer_vision\datas\mouth\0

import sys

# valratio: fraction of the samples that goes to the validation set
def splittrain_val(fileall, valratio=0.1):
    fileids = fileall.split('.')
    fileid = fileids[len(fileids) - 2]
    f = open(fileall)
    ftrain = open(fileid + "_train.txt", 'w')
    fval = open(fileid + "_val.txt", 'w')
    count = 0
    if valratio <= 0 or valratio >= 1:
        valratio = 0.1  # fall back to 10% validation for invalid ratios

    interval = int(1.0 / valratio)
    while 1:
        line = f.readline()
        if line:
            count = count + 1
            if count % interval == 0:  # every interval-th line goes to the validation file
                fval.write(line)
            else:
                ftrain.write(line)
        else:
            break
    f.close()
    ftrain.close()
    fval.close()

splittrain_val(sys.argv[1], 0.1)


Resulting file lists:
mouth0.txt
mouth0_train.txt
mouth0_train_newe.txt
mouth0_val.txt
mouth1.txt
mouth1_train.txt
mouth1_val.txt
mouth2.txt
mouth2_train.txt
mouth2_val.txt

Randomly shuffle the data

python shuffle_txt.py F:\code\PyCharmProject\yousan.ai\computer_vision\datas\mouth\0


# usage: python shuffle_txt.py <input.txt> <output.txt>
import sys
import random

def shuffle(file_in, file_out):
    fin = open(file_in, 'r')
    fout = open(file_out, 'w')

    lines = fin.readlines()
    random.shuffle(lines)  # shuffle the lines in place
    for line in lines:
        fout.write(line)
    fin.close()
    fout.close()

shuffle(sys.argv[1], sys.argv[2])

Deduplication

python remove_repeat.py
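
The content of remove_repeat.py is not shown here; a minimal sketch that drops byte-identical duplicates by hashing file contents could look like this (the hashing approach and the directory argument are assumptions, not the original script):

# remove_repeat.py (hypothetical sketch): remove byte-identical duplicate images
import hashlib
import os
import sys

seen = set()
for name in os.listdir(sys.argv[1]):
    path = os.path.join(sys.argv[1], name)
    with open(path, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    if digest in seen:
        os.remove(path)  # same content as an earlier file: delete it
    else:
        seen.add(digest)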

3. Model Definition and Training

Define the model

import torch.nn as nn
import torch.nn.functional as F

class simpleconv3(nn.Module):
    ## initialization
    def __init__(self, nclass):
        super(simpleconv3, self).__init__()
        self.conv1 = nn.Conv2d(3, 12, 3, 2)   # input 3*48*48, output feature map 12*23*23, kernel 3*3, stride 2
        self.bn1 = nn.BatchNorm2d(12)
        self.conv2 = nn.Conv2d(12, 24, 3, 2)  # input 12*23*23, output 24*11*11, kernel 3*3, stride 2
        self.bn2 = nn.BatchNorm2d(24)
        self.conv3 = nn.Conv2d(24, 48, 3, 2)  # input 24*11*11, output 48*5*5, kernel 3*3, stride 2
        self.bn3 = nn.BatchNorm2d(48)
        self.fc1 = nn.Linear(48 * 5 * 5, 1200)  # input vector length 48*5*5=1200, output length 1200
        self.fc2 = nn.Linear(1200, 128)         # input length 1200, output length 128
        self.fc3 = nn.Linear(128, nclass)       # input length 128, output length nclass (the number of classes)

    ## forward pass
    def forward(self, x):
        ## the relu function needs no instantiation and is called directly
        ## conv and fc layers are the nn.Module instances created above
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = x.view(-1, 48 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
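
A quick shape sanity check, following the 3*48*48 input noted in the comments above (the dummy input is for illustration only):

import torch

net = simpleconv3(2)
x = torch.randn(1, 3, 48, 48)  # one dummy image at the 48*48 training crop size
print(net(x).shape)            # expected: torch.Size([1, 2])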

Define the dataset reader

import os
import torch
from torchvision import datasets

image_size = 60  ## images are uniformly resized to this size
crop_size = 48   ## crop size, i.e. the training input size
nclass = 2       ## number of classes
model = simpleconv3(nclass)  ## create the model
data_dir = './data'          ## data directory

## read the data with torchvision's ImageFolder dataset interface
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x]) for x in ['train', 'val']}
## create the data loaders; set the batch size, shuffling, and number of worker processes
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x],
                                              batch_size=16,
                                              shuffle=True,
                                              num_workers=4) for x in ['train', 'val']}

## dataset sizes
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
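
data_transforms is used above but not defined in the snippet; a plausible definition consistent with image_size=60, crop_size=48, and the Normalize values used in the inference code might be (the exact augmentation choices are an assumption):

from torchvision import transforms

data_transforms = {
    'train': transforms.Compose([
        transforms.Resize(60),              # image_size
        transforms.RandomCrop(48),          # crop_size: random crops for augmentation
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])]),
    'val': transforms.Compose([
        transforms.Resize(60),
        transforms.CenterCrop(48),          # deterministic center crop for validation
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])]),
}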

Define the loss and optimizer

## the loss is cross-entropy; the optimizer is SGD with momentum; the LR schedule is step-based: every 100 epochs the learning rate is multiplied by 0.1
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=100, gamma=0.1)
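
With these settings the learning rate is lr(epoch) = 0.1 * 0.1^floor(epoch/100): 0.1 for epochs 0-99, 0.01 for epochs 100-199, and 0.001 for epochs 200-299 of the 300-epoch run below.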

Define the training function

model = train_model(model=model,
                    criterion=criterion,
                    optimizer=optimizer_ft,
                    scheduler=exp_lr_scheduler,
                    num_epochs=300)


## main training function
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)   ## set training mode
            else:
                model.train(False)  ## set evaluation mode

            running_loss = 0.0  ## accumulated loss
            running_accs = 0.0  ## accumulated number of correct predictions
            number_batch = 0
            ## fetch data from the dataloaders
            ## phase is 'train' or 'val'
            for data in dataloaders[phase]:
                inputs, labels = data
                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                optimizer.zero_grad()    ## clear the gradients
                outputs = model(inputs)  ## forward pass
                _, preds = torch.max(outputs.data, 1)  ## index of the max output is the predicted label
                loss = criterion(outputs, labels)      ## compute the loss
                if phase == 'train':
                    loss.backward()   ## backpropagate the error
                    optimizer.step()  ## update the parameters

                running_loss += loss.data.item()
                running_accs += torch.sum(preds == labels).item()
                number_batch += 1

            ## average loss and accuracy for this epoch
            epoch_loss = running_loss / number_batch
            epoch_acc = running_accs / dataset_sizes[phase]

            ## log loss and accuracy for visualization
            if phase == 'train':
                writer.add_scalar('data/trainloss', epoch_loss, epoch)
                writer.add_scalar('data/trainacc', epoch_acc, epoch)
            else:
                writer.add_scalar('data/valloss', epoch_loss, epoch)
                writer.add_scalar('data/valacc', epoch_acc, epoch)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

    writer.close()
    return model
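
writer and use_gpu are used in train_model but never defined in the snippets above; a minimal setup consistent with the add_scalar calls (torch.utils.tensorboard and tensorboardX both provide this API) might be:

import torch
from torch.utils.tensorboard import SummaryWriter

use_gpu = torch.cuda.is_available()  # only move tensors to the GPU when one is available
writer = SummaryWriter('logs')       # TensorBoard event files are written under ./logs
if use_gpu:
    model = model.cuda()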

Save the trained model

torch.save(model.state_dict(),'models/model.pt')

4. Model Inference

import sys
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from net import simpleconv3  ## the model definition above, assumed saved as net.py (as imported in section 5)

## create the network and load the trained weights
net = simpleconv3(2)  ## same architecture and class count as used in training
net.eval()
modelpath = sys.argv[1]
net.load_state_dict(torch.load(modelpath, map_location=lambda storage, loc: storage))

## define the preprocessing pipeline
data_transforms = transforms.Compose([
    transforms.Resize(48),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

## read a 3-channel image and expand it into a 4-D tensor
imagepath = sys.argv[2]
image = Image.open(imagepath)
imgblob = data_transforms(image).unsqueeze(0)

## run the model and take the argmax of the outputs as the predicted label
predict = net(imgblob)
index = np.argmax(predict.detach().numpy())
## print(predict)
## print(index)

if index == 0:
    print('the predict of ' + sys.argv[2] + ' is ' + str('none'))
else:
    print('the predict of ' + sys.argv[2] + ' is ' + str('smile'))
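
Invocation, assuming the script above is saved as inference.py (the script and test image names are illustrative):

python inference.py models/model.pt test.jpg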

5. Demo

import cv2
from net import simpleconv3
import torch
import numpy as np
from torchvision import datasets, models, transforms
from PIL import Image, ImageFont, ImageDraw
import sys
# render a Chinese label onto a frame
def show_chinese(img, text, pos):
    """
    :param img: OpenCV image
    :param text: Chinese text to display
    :param pos: position of the text
    :return: image with the (Chinese) text rendered on it
    """
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    font = ImageFont.truetype(font='msyh.ttc', size=36)
    draw = ImageDraw.Draw(img_pil)
    draw.text(pos, text, font=font, fill=(255, 0, 0))  # in PIL, RGB=(255,0,0) is red
    img_cv = np.array(img_pil)  # convert the PIL image back to a numpy array
    img = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)  # and back to OpenCV's BGR layout
    return img
### load the model
net = simpleconv3(2)  ## instantiate the model
modelpath = sys.argv[1]
net.load_state_dict(torch.load(modelpath, map_location=lambda storage, loc: storage))  ## load the saved state_dict
net.eval()  ## inference mode: switches layers such as dropout and batchnorm from train to eval behavior
torch.set_grad_enabled(False)  ## stop the autograd machinery to speed things up and save memory

## define the preprocessing pipeline
data_transforms = transforms.Compose([
    transforms.Resize(48),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
# read frames
# create a VideoCapture object and read from the input file
cap = cv2.VideoCapture(r'demo1.mp4')
# check whether the capture opened successfully
if not cap.isOpened():
    print("Error opening video file")
# read until the video is completed
while cap.isOpened():
    # capture frame by frame
    ret, frame = cap.read()
    if ret:
        ########### model inference: start ###########
        frame1 = frame
        (h, w) = frame1.shape[:2]
        print(h, w)
        image = Image.fromarray(cv2.cvtColor(frame1, cv2.COLOR_BGR2RGB))  # OpenCV BGR -> PIL RGB, matching training input
        image = image.resize((60, 60))
        imgblob = data_transforms(image).unsqueeze(0)
        predict = net(imgblob)
        index = np.argmax(predict.detach().numpy())
        res = ""
        if index == 0:
            res = '没有笑'  # not smiling
        if index == 1:
            res = '微笑'    # smiling
        if index == 2:      # only reachable with a 3-class model trained on the mouth2 (pout) data
            res = '嘟嘴'    # pouting
        print('the predict is ' + res)
        ########### model inference: end ###########

        ########### display the frame ###########
        frame = show_chinese(frame, res, (50, 50))
        cv2.rectangle(frame, (0, 0), (frame.shape[1], frame.shape[0]), (0, 0, 255))
        # display the resulting frame
        cv2.imshow('Frame', frame)
        # press Q on the keyboard to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    # break the loop
    else:
        break
# when everything is done, release the video capture object
cap.release()
# close all the frames
cv2.destroyAllWindows()

Logistic Regression Demo

Logistic regression is, simply put, the same as linear regression: the computation is still y = w * x + b. In its simple form, logistic regression handles binary classification: the sigmoid function squashes every number, positive or negative, into the interval (0, 1), and that value decides the class. Roughly, a probability greater than 0.5 means the second class, and less than 0.5 means the first class.

[Figure: sigmoid function plot]
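
A minimal numeric sketch of the binary case (pure NumPy; the weight, bias, and inputs are arbitrary illustration values):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))  # squashes any real number into (0, 1)

w, b = 2.0, -1.0
for x in [-1.0, 0.3, 2.0]:
    p = sigmoid(w * x + b)               # y = w * x + b, then sigmoid
    print(x, p, 'class:', int(p > 0.5))  # probability > 0.5 -> second class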

What we want here is multi-class classification: for each sample, the output dimension equals the number of classes. For 10 classes, the output is a 10-dimensional vector, and we use another activation function, softmax.

[Figure: softmax function]

This is how the softmax function works. Intuitively, it determines how large a probability each of the 10 numbers corresponds to. Since the 10 numbers can be positive or negative, they are first made positive with the exponential function and then summed; each of the 10 values is divided by that sum, which yields the probability of each class.
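
The same mechanism in code: exponentiate, sum, then divide (a 10-dimensional example with arbitrary scores):

import numpy as np

scores = np.array([1.3, -0.8, 0.2, 2.1, -1.5, 0.0, 0.7, -0.3, 1.0, -2.0])  # 10 raw outputs, some negative
exp_scores = np.exp(scores)            # the exponential makes every entry positive
probs = exp_scores / exp_scores.sum()  # divide each entry by the sum
print(probs, probs.sum())              # 10 class probabilities summing to 1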