I am currently trying to implement a grouped convolution (Group_Convolution) by splitting one convolution into 4 sub-parts and sending them to 4 different GPUs, in order to reduce the model's inference time. I use multithreading, expecting the four parts to run simultaneously. However, when I monitor the GPUs with nvidia-smi, the data is still transferred from GPU 0 to GPUs 1, 2 and 3 and executed sequentially instead of in parallel. Could you help me fix this? Thanks.
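To make the splitting concrete, here is a simplified, purely sequential sketch of what I mean (the shapes are placeholders chosen to match my hard-coded test below, a 256 -> 512 channel convolution with stride 2; cuda:0 ... cuda:3 are my four GPUs):

import torch
import torch.nn as nn

devices = ["cuda:0", "cuda:1", "cuda:2", "cuda:3"]

# One quarter of a 256 -> 512, stride-2 convolution on each GPU (64 -> 128 channels each)
convs = [nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1, bias=False).to(d) for d in devices]

x = torch.randn(100, 256, 14, 14, device=devices[0])
chunks = torch.chunk(x, 4, dim=1)                          # split along the channel axis
outs = [conv(c.to(d)) for conv, c, d in zip(convs, chunks, devices)]
out = torch.cat([o.to(devices[0]) for o in outs], dim=1)   # (100, 512, 7, 7) gathered on GPU 0

My actual code below tries to run these four pieces in separate threads instead of one after another.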
Here is my grouped-convolution code:
import torch
import torch.nn as nn
from threading import Thread

class Residual(nn.Module):
    def __init__(self, in_channels, out_channels, dev0, dev1, dev2, dev3, down_sample=False, decouple=False):
        super(Residual, self).__init__()
        self.dev0 = dev0
        self.dev1 = dev1
        self.dev2 = dev2
        self.dev3 = dev3
        self.down_sample = down_sample
        self.decouple = decouple
        # Hard-coded output buffers for the threading test (test for conv(256, 512))
        self.y0 = torch.zeros((100, 128, 7, 7), device=self.dev0)
        self.y1 = torch.zeros((100, 128, 7, 7), device=self.dev1)
        self.y2 = torch.zeros((100, 128, 7, 7), device=self.dev2)
        self.y3 = torch.zeros((100, 128, 7, 7), device=self.dev3)
        # End of hard-coded test buffers
        if in_channels == out_channels:
            if self.decouple:  # grouped convolution: one quarter of the channels per GPU
                self.conv1a = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev0)
                self.conv1b = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev1)
                self.conv1c = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev2)
                self.conv1d = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev3)
            else:              # plain (non-grouped) convolution on a single GPU
                self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False).to(self.dev0)
        else:
            if self.decouple:
                self.conv1a = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev0)
                self.conv1b = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev1)
                self.conv1c = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev2)
                self.conv1d = nn.Conv2d(in_channels // 4, out_channels // 4, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev3)
            else:
                self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1, bias=False).to(self.dev0)
        ...
    def Group_Conv(self, conv, device, in_tensor, out_tensor):
        # Run one quarter of the convolution on its own GPU and write the result
        # into the pre-allocated output buffer on that GPU.
        out_tensor.copy_(conv(in_tensor.to(device)))
    def forward(self, x):
        if self.decouple:
            a = torch.chunk(x, 4, dim=1)  # split the feature maps into 4 groups along the channel dimension
            # Run Group_Conv() on the 4 devices concurrently, one thread per GPU
            threads = [
                Thread(target=self.Group_Conv, args=(self.conv1a, self.dev0, a[0], self.y0)),
                Thread(target=self.Group_Conv, args=(self.conv1b, self.dev1, a[1], self.y1)),
                Thread(target=self.Group_Conv, args=(self.conv1c, self.dev2, a[2], self.y2)),
                Thread(target=self.Group_Conv, args=(self.conv1d, self.dev3, a[3], self.y3)),
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join()
            # Gather the four partial outputs back on GPU 0 and concatenate along the channel axis
            out = torch.cat([self.y0, self.y1.to(self.dev0), self.y2.to(self.dev0), self.y3.to(self.dev0)], dim=1)
        else:
            out = self.conv1(x)
        ...
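For reference, this is roughly how I construct and call the block in my timing test (a simplified sketch: the remaining layers of the block are omitted above, the constructor arguments here are just how I set them for this test, and I am assuming the omitted tail of forward() returns out):

import time
import torch

block = Residual(256, 512, dev0="cuda:0", dev1="cuda:1", dev2="cuda:2", dev3="cuda:3",
                 down_sample=True, decouple=True)
x = torch.randn(100, 256, 14, 14, device="cuda:0")

with torch.no_grad():
    for i in range(4):
        torch.cuda.synchronize(i)   # make sure all four GPUs are idle before timing
    start = time.time()
    out = block(x)                  # expecting the four chunks to run on the four GPUs in parallel
    for i in range(4):
        torch.cuda.synchronize(i)   # wait for all four GPUs to finish
    print(out.shape, "in", time.time() - start, "seconds")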