这种跨步复制的通用函数大致如下所示:
void cudaMemcpyStrided(
void *dst, int dstStride,
void *src, int srcStride,
int numElements, int elementSize, int kind) {
int srcPitchInBytes = srcStride * elementSize;
int dstPitchInBytes = dstStride * elementSize;
int width = 1 * elementSize;
int height = numElements;
cudaMemcpy2D(
dst, dstPitchInBytes,
src, srcPitchInBytes,
width, height,
kind);
}
对于您的示例,它可以称为
cudaMemcpyStrided(dest, 3, source, 2, 3, sizeof(double), cudaMemcpyDeviceToDevice);
“大致”,因为我只是从我测试它的(基于 Java/JCuda 的)代码中即时翻译它:
import static jcuda.runtime.JCuda.cudaMemcpy2D;
import java.util.Arrays;
import java.util.Locale;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.runtime.cudaMemcpyKind;
public class JCudaStridedMemcopy {
public static void main(String[] args) {
int dstLength = 9;
int srcLength = 6;
int dstStride = 3;
int srcStride = 2;
int numElements = 3;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
dstLength = 9;
srcLength = 12;
dstStride = 3;
srcStride = 4;
numElements = 3;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
dstLength = 18;
srcLength = 12;
dstStride = 3;
srcStride = 2;
numElements = 6;
runExample(dstLength, dstStride, srcLength, srcStride, numElements);
}
private static void runExample(int dstLength, int dstStride, int srcLength, int srcStride, int numElements) {
double dst[] = new double[dstLength];
double src[] = new double[srcLength];
for (int i = 0; i < src.length; i++) {
src[i] = i;
}
cudaMemcpyStrided(dst, dstStride, src, srcStride, numElements);
System.out.println("Copy " + numElements + " elements");
System.out.println(" to array with length " + dstLength + ", with a stride of " + dstStride);
System.out.println(" from array with length " + srcLength + ", with a stride of " + srcStride);
System.out.println("");
System.out.println("Destination:");
System.out.println(toString2D(dst, dstStride));
System.out.println("Flat: " + Arrays.toString(dst));
System.out.println("");
System.out.println("Source:");
System.out.println(toString2D(src, srcStride));
System.out.println("Flat: " + Arrays.toString(src));
System.out.println("");
System.out.println("Done");
System.out.println("");
}
private static void cudaMemcpyStrided(double dst[], int dstStride, double src[], int srcStride, int numElements) {
long srcPitchInBytes = srcStride * Sizeof.DOUBLE;
long dstPitchInBytes = dstStride * Sizeof.DOUBLE;
long width = 1 * Sizeof.DOUBLE;
long height = numElements;
cudaMemcpy2D(
Pointer.to(dst), dstPitchInBytes,
Pointer.to(src), srcPitchInBytes,
width, height,
cudaMemcpyKind.cudaMemcpyHostToHost);
}
public static String toString2D(double[] a, long columns) {
String format = "%4.1f ";
;
StringBuilder sb = new StringBuilder();
for (int i = 0; i < a.length; i++) {
if (i > 0 && i % columns == 0) {
sb.append("\n");
}
sb.append(String.format(Locale.ENGLISH, format, a[i]));
}
return sb.toString();
}
}
为了了解函数的作用,基于示例/测试用例,以下是输出:
Copy 3 elements
to array with length 9, with a stride of 3
from array with length 6, with a stride of 2
Destination:
0.0 0.0 0.0
2.0 0.0 0.0
4.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0]
Source:
0.0 1.0
2.0 3.0
4.0 5.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
Done
Copy 3 elements
to array with length 9, with a stride of 3
from array with length 12, with a stride of 4
Destination:
0.0 0.0 0.0
4.0 0.0 0.0
8.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 8.0, 0.0, 0.0]
Source:
0.0 1.0 2.0 3.0
4.0 5.0 6.0 7.0
8.0 9.0 10.0 11.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
Done
Copy 6 elements
to array with length 18, with a stride of 3
from array with length 12, with a stride of 2
Destination:
0.0 0.0 0.0
2.0 0.0 0.0
4.0 0.0 0.0
6.0 0.0 0.0
8.0 0.0 0.0
10.0 0.0 0.0
Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0, 6.0, 0.0, 0.0, 8.0, 0.0, 0.0, 10.0, 0.0, 0.0]
Source:
0.0 1.0
2.0 3.0
4.0 5.0
6.0 7.0
8.0 9.0
10.0 11.0
Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
Done