¿Por qué cudaFree no parece liberar memoria?

Estoy tratando de asignar memoria en el dispositivo, copiar los datos, realizar los cálculos en la GPU, copiar los resultados de vuelta y luego liberar la memoria del dispositivo que asigné. Quería asegurarme de no sobrepasar el límite y ver si tendría suficiente memoria en el espacio de memoria compartida para volcar algunas matrices.

Cuando asigno la memoria del dispositivo, no se devuelve ningún error. Cuando uso cudaMemGetInfo para verificar la cantidad de memoria asignada, parece que cudaMalloc no ha asignado ninguna memoria. Además, cuando intento liberar la memoria, parece que solo se libera un puntero.

Estoy utilizando la interfaz mexFunction de MATLAB para configurar la memoria de la GPU y ejecutar el kernel. En este punto, ni siquiera estoy llamando al kernel y simplemente devuelvo una matriz de unidades para los resultados.

/* Status of the most recent CUDA runtime call. */
cudaError_t cudaErr;
/* Byte counts filled in by cudaMemGetInfo: freeMem holds the baseline free
 * amount, totalMem the device total, allocMem the latest free amount. */
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
/* Record the baseline before any allocation is made. */
cudaMemGetInfo(&freeMem, &totalMem);
/* size_t values are cast to unsigned long long and printed with %llu:
 * %lu is only 32 bits wide on 64-bit Windows, where size_t is 64 bits. */
mexPrintf("Memory avaliable: Free: %llu, Total: %llu\n",
          (unsigned long long)freeMem, (unsigned long long)totalMem);

/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;

/* Allocate memory on the device for the arrays.
 * After every cudaMalloc the free amount is re-read into allocMem and the
 * cumulative difference from the baseline (freeMem) is printed as "Consumed".
 * A failed allocation is reported but does not stop the sequence.
 * size_t values are cast to unsigned long long and printed with %llu because
 * %lu is too narrow for size_t on 64-bit Windows. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc((void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));

/* copy the input arrays across to the device.
 * Each copy uses the same byte count as the matching cudaMalloc. The free
 * amount is re-read after every copy and reported relative to the baseline
 * (freeMem). A failed copy is reported but does not stop the sequence.
 * size_t values are cast to unsigned long long and printed with %llu because
 * %lu is too narrow for size_t on 64-bit Windows. */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));

/* call the kernel */
// launchKernel<<<1,512>>>(........);

/* retrieve the output.
 * NOTE(review): with the kernel launch commented out, nothing in this snippet
 * writes the two device buffers before they are copied back, so the host
 * arrays receive whatever the device memory happened to contain.
 * NOTE(review): the device buffers are allocated with 999*512 doubles but only
 * 512*512 doubles are copied back -- confirm against the size of the host
 * arrays receivedReal / receivedImag.
 * size_t values are cast to unsigned long long and printed with %llu because
 * %lu is too narrow for size_t on 64-bit Windows. */
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to receivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedReal: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to receivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedImag: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(freeMem - allocMem));

/* free the memory.
 * freeMem is re-read here so it becomes the baseline for this phase; after
 * each cudaFree the free amount is re-read into allocMem and the cumulative
 * gain (allocMem - freeMem) is printed as "Free'd".
 * The failure messages now read "could not free ..." (they previously said
 * "could free ...").
 * size_t values are cast to unsigned long long and printed with %llu because
 * %lu is too narrow for size_t on 64-bit Windows. */
mexPrintf("\nFree'ing memory.\n");
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Before freeing: Free %llu, Total: %llu\n",
          (unsigned long long)freeMem, (unsigned long long)totalMem);
cudaErr = cudaFree(devicePulseDelay);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceTarDistance);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceScattDistance);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceScatterers);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedReal);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedImag);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long)allocMem, (unsigned long long)totalMem,
          (unsigned long long)(allocMem - freeMem));

Aquí está la salida de esta:

 
Memory avaliable: Free: 2523959296, Total: 2818572288 
Allocating memory. 
devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 
deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 
deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 
deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 
deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 
deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 

Copying memory. 
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 
receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 

Free'ing memory. 
Before freeing: Free 2513473536, Total: 2818572288 
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 
deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 
deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 
deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576

Siento que hay algo obvio que me falta. ¿Alguien puede ayudar a explicar lo que está pasando?

EDIT: la plataforma es Windows 7 con una tarjeta GPU Tesla C2050.

Fuente

2012-05-01 Beau Bellamy

¿En qué plataforma está ejecutando este código? – talonmies

Intenta poner a cero los valores de allocMem y totalMem antes de cada llamada a cudaMemGetInfo() y comprueba el valor de retorno de cudaMemGetInfo(). –

Poner a cero allocMem y totalMem antes de cada llamada a cudaMemGetInfo() no supuso ninguna diferencia. Además, las llamadas a cudaMemGetInfo no devolvieron ningún error. Para su información, mi plataforma es Windows 7 con una tarjeta GPU Tesla C2050. –

Es una idea errónea muy común que malloc obtiene las asignaciones de memoria directamente del sistema operativo anfitrión cuando se le llama, y que free las devuelve directamente al sistema operativo anfitrión cuando se le llama. Pero casi nunca funcionan así; en su lugar, la biblioteca estándar mantiene una lista circular de memoria liberada y asignada que se expande y contrae de forma oportunista al interactuar con el sistema operativo anfitrión (vea algunas de las respuestas en «How do malloc() and free() work?» para más detalles, si está interesado). Independientemente de cómo funcione, esto conduce a una serie de resultados no intuitivos, incluido el hecho de que generalmente es imposible asignar tanta memoria como el sistema operativo dice que está libre, que las asignaciones a veces parecen no cambiar la cantidad de memoria libre, y que free a veces no tiene ningún efecto sobre la cantidad de memoria que el sistema operativo dice que está libre.

Aunque no tengo más que evidencia empírica para apoyar esto, creo que CUDA funciona exactamente de la misma manera. El contexto mantiene su propia lista de memoria asignada y liberada, y expande y contrae la memoria contenida en esa lista según lo permitan el controlador del host, el gestor de ventanas y la propia GPU. Todo hardware tiene un tamaño de página de MMU característico, y hay evidencia que sugiere que el tamaño de página en las GPU NVIDIA es bastante grande. Esto implica que hay una granularidad bastante gruesa en las llamadas a cudaMalloc, y significa que a veces un malloc no afecta a la cantidad de memoria libre o consume mucha más memoria que la solicitada, y que algunas veces las llamadas a free parecen no tener ningún efecto (si está interesado, puede encontrar aquí una pequeña herramienta que ayuda a ilustrar el comportamiento del tamaño de página del controlador CUDA, aunque fue escrita para una versión anterior de la API de CUDA y podría necesitar un par de cambios para compilar con las versiones modernas). Creo que esta es la explicación más probable del comportamiento que está observando.

Por cierto, si ejecuto una versión simplificada del código que publicó en MacOS 10.6 con un dispositivo de familia GT200:

#include <cstdio> 

#define mexPrintf printf 

/**
 * Report a failed CUDA runtime call and optionally terminate the process.
 *
 * Does nothing when `code` is cudaSuccess. Otherwise prints the CUDA error
 * string together with the source location through mexPrintf, then calls
 * exit(code) unless `abort` is false.
 *
 * @param code   status returned by the CUDA call being checked
 * @param file   source file name of the call site (normally __FILE__); taken
 *               as const char* because __FILE__ is a string literal
 * @param line   source line of the call site (normally __LINE__)
 * @param abort  when true (the default), exit with `code` after reporting
 */
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort=true)
{
    if (code != cudaSuccess)
    {
        mexPrintf("GPUassert: %s %s %d\n", cudaGetErrorString(code),
                  file, line);
        if (abort) exit(code);
    }
}

// Wraps a CUDA runtime call and forwards its status plus the call-site
// location to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 

/**
 * Query the device's free/total memory and print a one-line report.
 *
 * The values returned by cudaMemGetInfo are stored through `avail` and
 * `total`; a failing query is handled by gpuErrchk.
 *
 * @param avail  receives the current free byte count
 * @param total  receives the total byte count
 * @param title  optional label printed at the start of the line; only used
 *               when `free` is given. Printed in full (no length limit).
 * @param free   optional reference free byte count; when non-null the report
 *               also prints the difference between it and *avail
 * @param sense  true: report "Allocated" as (*free - *avail);
 *               false: report "Freed" as (*avail - *free).
 *               Both are unsigned subtractions and wrap if the memory moved
 *               in the opposite direction.
 */
inline void gpuMemReport(size_t * avail, size_t * total,
     const char * title = 0, const size_t * free = 0, const bool sense = true)
{
    gpuErrchk(cudaMemGetInfo(avail, total));

    if (free) {
        // Print the caller's title directly rather than copying it into a
        // fixed 32-byte buffer, which silently truncated longer titles.
        const char * label = title ? title : "";
        const char * what = sense ? "Allocated" : "Freed";
        const size_t delta = sense ? (*free - *avail) : (*avail - *free);
        mexPrintf("%s Memory avaliable: Free: %zu, Total: %zu, %s: %zu\n",
                  label, *avail, *total, what, delta);
    } else {
        mexPrintf("Memory avaliable: Free: %zu, Total: %zu\n", *avail, *total);
    }
}

/**
 * Allocates six device buffers one at a time, then frees them in the same
 * order, printing the device's free memory after every step so the effect of
 * each cudaMalloc / cudaFree can be seen.
 */
int main()
{
    // One entry per device buffer: report label, size in bytes, device pointer.
    struct DeviceBuffer {
        const char *label;
        size_t      bytes;
        double     *ptr;
    };

    DeviceBuffer buffers[] = {
        { "devicePulseDelay:",    sizeof(double)*512,     0 },
        { "deviceTarDistance:",   sizeof(double)*512,     0 },
        { "deviceScattDistance:", sizeof(double)*999*512, 0 },
        { "deviceScatterers:",    sizeof(double)*999,     0 },
        { "deviceReceivedReal:",  sizeof(double)*999*512, 0 },
        { "deviceReceivedImag:",  sizeof(double)*999*512, 0 },
    };
    const int bufferCount = sizeof(buffers) / sizeof(buffers[0]);

    size_t baseline = 0;   // free bytes at the start of the current phase
    size_t capacity = 0;   // total bytes reported by the device
    size_t current  = 0;   // free bytes after the latest step

    // cudaFree(0) releases nothing; presumably issued so the runtime is
    // initialised before the first measurement -- confirm if it matters.
    gpuErrchk(cudaFree(0));
    gpuMemReport(&baseline, &capacity);

    mexPrintf("Allocating memory.\n");
    for (int i = 0; i < bufferCount; ++i) {
        gpuErrchk(cudaMalloc((void **) &buffers[i].ptr, buffers[i].bytes));
        gpuMemReport(&current, &capacity, buffers[i].label, &baseline);
    }

    mexPrintf("\nFree'ing memory.\n");
    // Re-read the baseline so the "Freed" figures are relative to this point.
    gpuMemReport(&baseline, &capacity);

    for (int i = 0; i < bufferCount; ++i) {
        gpuErrchk(cudaFree(buffers[i].ptr));
        gpuMemReport(&current, &capacity, buffers[i].label, &baseline, false);
    }

    return 0;
}

obtengo un resultado diferente, pero que muestra el mismo fenómeno:

Allocating memory. 
devicePulseDelay: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576 
deviceTarDistance: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576 
deviceScattDistance: Memory avaliable: Free: 198778880, Total: 265027584, Allocated: 5140480 
deviceScatterers: Memory avaliable: Free: 197730304, Total: 265027584, Allocated: 6189056 
deviceReceivedReal: Memory avaliable: Free: 193638400, Total: 265027584, Allocated: 10280960 
deviceReceivedImag: Memory avaliable: Free: 189546496, Total: 265027584, Allocated: 14372864 

Free'ing memory. 
Memory avaliable: Free: 189546496, Total: 265027584 
devicePulseDelay: Memory avaliable: Free: 189546496, Total: 265027584, Freed: 0 
deviceTarDistance: Memory avaliable: Free: 190595072, Total: 265027584, Freed: 1048576 
deviceScattDistance: Memory avaliable: Free: 194686976, Total: 265027584, Freed: 5140480 
deviceScatterers: Memory avaliable: Free: 195735552, Total: 265027584, Freed: 6189056 
deviceReceivedReal: Memory avaliable: Free: 199827456, Total: 265027584, Freed: 10280960 
deviceReceivedImag: Memory avaliable: Free: 203919360, Total: 265027584, Freed: 14372864

Lo que sugiere que el comportamiento también depende del sistema operativo y del hardware del host.

Fuente

2012-05-02 07:17:10 talonmies

¿Por qué CudaFree no parece liberar memoria?

Respuesta

Cuestiones relacionadas