Training a Model with Parallel Computing in the Research Assistant

Data Preparation

The materials include the training code, the training data, and the training environment.

1. Training code

The training code in this example consists of two parts, as described below:

(1) Startup script

This example provides a startup script, start.sh, which simplifies the startup command you need to fill in when creating a task. Its content is as follows:

```bash
sudo /opt/conda/bin/python3 /storage/mnist/bcmnist.py
```

You can adjust the content of the startup script to fit your actual situation.

(2) Model training code

The training code in this example covers both training and validation, and is adapted from the official PyTorch example code. The content of bcmnist.py is as follows:

```python
from __future__ import print_function
import argparse
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.onnx
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum up the batch loss, then count correct predictions
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
```
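The excerpt above ends before the script's entry point, even though train() and test() reference arguments such as args.log_interval and args.dry_run. For reference, below is a minimal sketch of a main() in the style of the official PyTorch MNIST example that bcmnist.py is adapted from. The argument names, the hyperparameter defaults, and the /storage/mnist/data download path are assumptions for illustration, not part of the original document.

```python
# Minimal sketch of an entry point, modeled on the official PyTorch MNIST
# example. Hyperparameter defaults and the /storage/mnist/data path are
# assumptions; adjust them to match your actual environment.
def main():
    parser = argparse.ArgumentParser(description='PyTorch MNIST example')
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=14)
    parser.add_argument('--lr', type=float, default=1.0)
    parser.add_argument('--gamma', type=float, default=0.7)
    parser.add_argument('--log-interval', type=int, default=10)
    parser.add_argument('--dry-run', action='store_true', default=False)
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean/std
    ])
    # Assumed data location, consistent with the path used in start.sh.
    train_set = datasets.MNIST('/storage/mnist/data', train=True,
                               download=True, transform=transform)
    test_set = datasets.MNIST('/storage/mnist/data', train=False,
                              transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=1000)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()


if __name__ == '__main__':
    main()
```

With an entry point like this in place, the startup script shown in step (1) launches the full training and validation loop when the task starts.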