Estoy tratando de asignar la memoria del dispositivo, copiarlo, realizar los cálculos en la GPU, copiar los resultados y luego liberar la memoria del dispositivo que asigné. Quería asegurarme de no sobrepasar el límite y quería ver si tendría suficiente memoria en el espacio de memoria compartida para volcar algunas matrices.¿Por qué CudaFree no parece liberar memoria?
Cuando asigno la memoria del dispositivo, no se devuelve ningún error. Cuando uso cudaMemGetInfo
para verificar la cantidad de memoria asignada, parece que cudaMalloc
no ha asignado ninguna memoria. Además, cuando intento liberar la memoria, parece que solo se libera un puntero.
Estoy utilizando la interfaz matlab Mexfunction
para configurar la memoria de la GPU y ejecutar el kernel. En este punto, ni siquiera estoy llamando al kernel y simplemente regreso una matriz de unidades para los resultados.
cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem);
/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;
/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc((void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem));
/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
/* call the kernel */
// launchKernel<<<1,512>>>(........);
/* retireve the output */
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
/* free the memory. */
mexPrintf("\nFree'ing memory.\n");
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem);
cudaErr = cudaFree(devicePulseDelay);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceTarDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScattDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScatterers);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedReal);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedImag);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
Aquí está la salida de esta:
Memory avaliable: Free: 2523959296, Total: 2818572288 Allocating memory. devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Copying memory. devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Free'ing memory. Before freeing: Free 2513473536, Total: 2818572288 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
Siento que hay algo obvio que me falta. ¿Alguien puede ayudar a explicar lo que está pasando?
EDIT: plataforma es Windows 7 con una tarjeta Tesla C2050 GPu.
¿En qué plataforma está ejecutando este código? – talonmies
Intenta poner a cero los valores de allocMem y totalMem antes de cada llamada a cudaMemGetInfo() y comprueba el valor de retorno de cudaMemGetInfo(). –
poniendo a cero el allocMem y totalMem antes de cada llamada a cudaMemGetInfo() no hizo ninguna diferencia. Además, las llamadas cudaMemGetInfo no devolvieron ningún error. FYI, mi plataforma es Windows 7 en una tarjeta Tesla C2050 GPU. –