1.想做一个图像分类任务,到底分析与实战流程是什么样的? 2.逻辑回归简单demo
1.数据采集 现有数据集 1 2 3 4 这是一个嘴唇数据集,包括无状态的嘴唇和微笑状态的嘴唇,为本项目的caffe和tensorflow例子使用。 所有图片的尺寸为60*60 0 无状态 1 微笑
爬取 利用 Image-Downloader 开源项目获取数据集
环境:python selenium && chromedriver
爬取命令:
1 python image_downloader.py --engine Bing "嘟嘴"
# 2. Data processing — check image sizes.
# Usage: python checksize.py <image_dir>
# checksize.py contents:
for image in os.listdir(sys.argv[1]):
    img = cv2.imread(os.path.join(sys.argv[1], image))
    if img is None:
        # cv2.imread returns None for unreadable/corrupt files; the original
        # crashed on `img.shape` here.
        print(image, " could not be read")
        continue
    print(image, " shape is ", img.shape)
# Re-encode every image as JPEG (and optionally rename sequentially).
# Usage: python reformat_images.py <image_dir>
# reformat_images.py contents — body of the per-file loop; `filepath`, `root`,
# `fileid`, `rename` and `num` come from the surrounding walk loop (not shown).
try:
    src = cv2.imread(filepath, 1)
    if src is None:
        # Unreadable image: treat as corrupt and drop it (the original relied
        # on `src.shape` raising AttributeError to reach the except branch).
        os.remove(filepath)
    else:
        print("src=", filepath, src.shape)
        os.remove(filepath)  # delete the original file
        if rename:
            # Write the re-encoded image under a fresh sequential name.
            cv2.imwrite(os.path.join(root, str(num) + ".jpg"), src)
            num = num + 1
        else:
            # Write the re-encoded image, keeping the original file id.
            cv2.imwrite(os.path.join(root, fileid + ".jpg"), src)
except Exception:
    # Narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit.
    os.remove(filepath)  # remove broken image
# Batch-resize images to a uniform size.
from PIL import Image
import os


def image_resize(image_path, new_path):
    """Resize every file under image_path to 60x60 and save it into new_path.

    :param image_path: directory containing the source images
    :param new_path: directory the resized copies are written to (created if missing)
    """
    print('============>>修改图片尺寸')
    # The original crashed with FileNotFoundError when the target dir was missing.
    os.makedirs(new_path, exist_ok=True)
    for img_name in os.listdir(image_path):
        # os.path.join instead of manual "/" concatenation (portable).
        img_path = os.path.join(image_path, img_name)
        image = Image.open(img_path)
        image = image.resize((60, 60))  # target training size
        # process the 1 channel image
        image.save(os.path.join(new_path, img_name))
    print("end the processing!")


if __name__ == '__main__':
    print("ready for :::::::: ")
    ori_path = "./data/test"
    new_path = "./data/testtmp"
    image_resize(ori_path, new_path)
# Produce train/val list files from a single list file.
# Usage: python split_train_val.py <listfile>
# Outputs: <stem>_train.txt and <stem>_val.txt next to the input, e.g.
#          mouth0.txt -> mouth0_train.txt / mouth0_val.txt
import os
import sys


def splittrain_val(fileall, valratio=0.1):
    """Split the lines of `fileall` into <stem>_train.txt and <stem>_val.txt.

    :param fileall: path of the input list file
    :param valratio: fraction of lines routed to the val file — every Nth line
        with N = int(1/valratio); out-of-range values fall back to 0.1
    """
    # os.path.splitext keeps the full stem; the original `split('.')[-2]`
    # silently dropped everything before an extra dot in the path.
    fileid = os.path.splitext(fileall)[0]
    if valratio <= 0 or valratio >= 1:  # original only checked == 0 / >= 1
        valratio = 0.1
    interval = int(1.0 / valratio)
    count = 0
    # `with` closes all three files; the original leaked the handles.
    with open(fileall) as f, \
            open(fileid + "_train.txt", 'w') as ftrain, \
            open(fileid + "_val.txt", 'w') as fval:
        for line in f:
            count += 1
            if count % interval == 0:
                fval.write(line)
            else:
                ftrain.write(line)


if __name__ == '__main__':
    splittrain_val(sys.argv[1], 0.1)
# Randomly shuffle the lines of a text file.
# Usage: python shuffle_txt.py <infile> <outfile>
import random
import sys


def shuffle(file_in, file_out):
    """Write the lines of file_in to file_out in random order.

    :param file_in: input text file path
    :param file_out: output text file path (overwritten)
    """
    # `with` closes both files; the original leaked the handles.
    with open(file_in, 'r') as fin:
        lines = fin.readlines()
    random.shuffle(lines)  # in-place shuffle
    with open(file_out, 'w') as fout:
        fout.writelines(lines)


if __name__ == '__main__':
    shuffle(sys.argv[1], sys.argv[2])
去重
class simpleconv3(nn.Module):
    """Three-conv-layer baseline classifier for 3x48x48 inputs.

    Trunk: three stride-2 conv+BN+ReLU stages (3->12->24->48 channels,
    48x48 -> 23x23 -> 11x11 -> 5x5 feature maps, 3x3 kernels), followed by
    three fully-connected layers ending in `nclass` logits.
    """

    def __init__(self, nclass):
        super(simpleconv3, self).__init__()
        # Convolutional trunk: 3x3 kernel, stride 2 at every stage.
        self.conv1 = nn.Conv2d(3, 12, 3, 2)    # 3x48x48  -> 12x23x23
        self.bn1 = nn.BatchNorm2d(12)
        self.conv2 = nn.Conv2d(12, 24, 3, 2)   # 12x23x23 -> 24x11x11
        self.bn2 = nn.BatchNorm2d(24)
        self.conv3 = nn.Conv2d(24, 48, 3, 2)   # 24x11x11 -> 48x5x5
        self.bn3 = nn.BatchNorm2d(48)
        # Classifier head: 48*5*5 = 1200 -> 1200 -> 128 -> nclass.
        self.fc1 = nn.Linear(48 * 5 * 5, 1200)
        self.fc2 = nn.Linear(1200, 128)
        self.fc3 = nn.Linear(128, nclass)

    def forward(self, x):
        # Shared trunk: conv -> batch-norm -> ReLU, three times.
        for conv, bn in ((self.conv1, self.bn1),
                         (self.conv2, self.bn2),
                         (self.conv3, self.bn3)):
            x = F.relu(bn(conv(x)))
        # Flatten the 48x5x5 feature map into a 1200-d vector.
        x = x.view(-1, 48 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
## Dataset reader setup.
image_size = 60   ## size every image is rescaled to
crop_size = 48    ## crop size, i.e. the training input size
nclass = 2        ## number of classes
model = simpleconv3(nclass)  ## build the model
data_dir = './data'          ## data root; expects ./data/train and ./data/val

## Read the data with torchvision's ImageFolder dataset interface.
## NOTE(review): `data_transforms` (a per-phase transform dict keyed by
## 'train'/'val') must be defined before this snippet — it is not shown here.
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}

## Data loaders: batch size 16, shuffled, 4 worker processes.
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x],
                                              batch_size=16,
                                              shuffle=True,
                                              num_workers=4)
               for x in ['train', 'val']}

## Number of samples in each phase (used for per-epoch accuracy).
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
## Training optimizer setup: cross-entropy loss, SGD with momentum, and a
## step LR schedule that multiplies the learning rate by 0.1 every 100 epochs.
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=100, gamma=0.1)
## Training main function.
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train `model`, alternating a train and a val phase each epoch.

    :param model: the network to train
    :param criterion: loss function
    :param optimizer: parameter optimizer
    :param scheduler: per-epoch learning-rate scheduler
    :param num_epochs: number of epochs to run
    :return: the trained model

    Relies on module-level `dataloaders`, `dataset_sizes`, `use_gpu` and the
    tensorboard `writer` (defined elsewhere in the script).
    """
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train(True)   ## training mode (BN/dropout active)
            else:
                model.train(False)  ## evaluation mode

            running_loss = 0.0  ## accumulated loss
            running_accs = 0.0  ## accumulated correct-prediction count
            number_batch = 0

            ## Iterate over the batches of this phase.
            for data in dataloaders[phase]:
                inputs, labels = data
                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                optimizer.zero_grad()                   ## clear gradients
                outputs = model(inputs)                 ## forward pass
                _, preds = torch.max(outputs.data, 1)   ## predicted class index
                loss = criterion(outputs, labels)       ## compute loss
                if phase == 'train':
                    loss.backward()   ## backpropagate
                    optimizer.step()  ## update parameters

                running_loss += loss.data.item()
                running_accs += torch.sum(preds == labels).item()
                number_batch += 1

            ## Since PyTorch 1.1 the LR scheduler must step AFTER the epoch's
            ## optimizer updates; the original stepped before them, skipping
            ## the initial learning rate.
            if phase == 'train':
                scheduler.step()

            ## Average loss and accuracy for this epoch.
            epoch_loss = running_loss / number_batch
            epoch_acc = running_accs / dataset_sizes[phase]

            ## Collect loss/accuracy for visualization.
            if phase == 'train':
                writer.add_scalar('data/trainloss', epoch_loss, epoch)
                writer.add_scalar('data/trainacc', epoch_acc, epoch)
            else:
                writer.add_scalar('data/valloss', epoch_loss, epoch)
                writer.add_scalar('data/valacc', epoch_acc, epoch)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

    writer.close()
    return model


## Run training (the blog showed this call above the definition; it must
## come after the def in an actual script).
model = train_model(model=model,
                    criterion=criterion,
                    optimizer=optimizer_ft,
                    scheduler=exp_lr_scheduler,
                    num_epochs=300)
## Save the trained weights (state_dict only, not the whole module).
import os
os.makedirs('models', exist_ok=True)  # original crashed if 'models/' was missing
torch.save(model.state_dict(), 'models/model.pt')
# 4. Model inference.
## Load the trained weights into the network (`net` is built elsewhere).
modelpath = sys.argv[1]
net.load_state_dict(torch.load(modelpath,
                               map_location=lambda storage, loc: storage))
## Inference mode: freeze BatchNorm statistics. The original skipped this,
## so BN used per-batch stats on a batch of one — wrong predictions.
net.eval()

## Preprocessing pipeline.
## NOTE(review): Resize(48) scales the SHORTER side to 48 (not a square
## resize); training used resize-to-60 + crop-48 — confirm this is intended.
data_transforms = transforms.Compose([
    transforms.Resize(48),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

## Read the image as 3-channel RGB and expand to a 4-D (1,C,H,W) tensor.
## convert('RGB') guards against grayscale/RGBA files, which would break the
## 3-channel Normalize above.
imagepath = sys.argv[2]
image = Image.open(imagepath).convert('RGB')
imgblob = data_transforms(image).unsqueeze(0)

## Run the network and take the arg-max class index as the label.
predict = net(imgblob)
index = np.argmax(predict.detach().numpy())

if index == 0:
    print('the predict of ' + sys.argv[2] + ' is ' + str('none'))
else:
    print('the predict of ' + sys.argv[2] + ' is ' + str('smile'))
# 5. Demo — imports (the blog paste ran these together on one line with no
# separators, which is not valid Python; split and grouped conventionally).
import sys

import cv2
import numpy as np
import torch
from PIL import Image, ImageFont, ImageDraw
from torchvision import datasets, models, transforms

from net import simpleconv3
def show_chinese(img, text, pos):
    """Render (Chinese) text onto an OpenCV image.

    :param img: OpenCV image (BGR numpy array)
    :param text: text to draw (may contain Chinese characters)
    :param pos: (x, y) position where the text is drawn
    :return: a BGR image with the text rendered
    """
    # OpenCV's putText cannot render CJK glyphs, so round-trip through PIL:
    # BGR -> RGB -> draw with a TrueType font -> back to BGR.
    img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    # NOTE(review): 'msyh.ttc' (Microsoft YaHei) is a Windows system font —
    # confirm it is available on the deployment machine.
    font = ImageFont.truetype(font='msyh.ttc', size=36)
    draw = ImageDraw.Draw(img_pil)
    draw.text(pos, text, font=font, fill=(255, 0, 0))  # red text
    img_cv = np.array(img_pil)
    img = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
    return img
## Build the network and load the trained weights.
net = simpleconv3(2)
modelpath = sys.argv[1]
## The training script saved net.state_dict() (an OrderedDict), so restore it
## with load_state_dict; the original re-bound `net = torch.load(...)`, which
## left `net` as a plain dict and would crash on the first forward call.
net.load_state_dict(torch.load(modelpath,
                               map_location=lambda storage, loc: storage))
## eval() AFTER the weights are loaded: BN/dropout in inference mode.
net.eval()
## A bare `torch.no_grad()` expression does nothing (the context manager is
## never entered); disable autograd globally for this inference-only script.
torch.set_grad_enabled(False)

data_transforms = transforms.Compose([
    transforms.Resize(48),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])
## Read the demo video frame by frame, classify each frame, and overlay the
## predicted label on the displayed image. Press 'q' to quit.
cap = cv2.VideoCapture(r'demo1.mp4')
if not cap.isOpened():
    print("Error opening video file")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    (h, w) = frame.shape[:2]
    print(h, w)

    ## OpenCV frames are BGR; convert to RGB before handing to PIL so the
    ## channel order matches the RGB images the model was trained on
    ## (the original fed BGR straight in, swapping red and blue).
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    image = image.resize((60, 60))  ## same size the dataset images use
    imgblob = data_transforms(image).unsqueeze(0)
    predict = net(imgblob)
    index = np.argmax(predict.detach().numpy())

    res = ""
    if index == 0:
        print('the predict of ' + ' is ' + str('没有笑'))
        res = '没有笑'
    if index == 1:
        print('the predict of ' + ' is ' + str('微笑'))
        res = '微笑'
    ## NOTE(review): with simpleconv3(2) the argmax can only be 0 or 1, so
    ## this branch is unreachable unless a 3-class model ("嘟嘴"/pouting)
    ## is loaded instead.
    if index == 2:
        print('the predict of ' + ' is ' + str('嘟嘴'))
        res = '嘟嘴'

    frame = show_chinese(frame, res, (50, 50))
    cv2.rectangle(frame, (0, 0), (frame.shape[1], frame.shape[0]), (0, 0, 255))
    cv2.imshow('Frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
逻辑回归demo logistic回归简单来说和线性回归是一样的,要做的运算同样是 y = w * x + b,不同的是 logistic 回归做的是二分类问题:使用 sigmoid 函数将任意实数(无论正负)都映射到 0-1 之间的数,这样就可以用这个数来确定到底属于哪一类,可以简单地认为概率大于 0.5 即为第二类,小于 0.5 为第一类。
sigmoid 函数图像
我们这里要做的是多分类问题 ,对于每一个数据,我们输出的维数是分类的总数,比如10分类,我们输出的就是一个10维的向量,然后我们使用另外一个激活函数,softmax
这就是softmax函数作用的机制,其实简单的理解就是确定这10个数每个数对应的概率有多大,因为这10个数有正有负,
所以先通过指数函数将他们全部变成正数,然后求和,然后这10个数每个数都除以这个和,这样就得到了每个类别的概率。