Created
March 15, 2012 01:10
-
-
Save syadlowsky/2040953 to your computer and use it in GitHub Desktop.
CALLOC FREE weirdness
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h>

/*
 * Cache-capacity tuning constants, expressed as a number of floats.
 */
// number of floats that fit in L1 cache
#define L1_SIZE 1024
// share of the L1 budget reserved for a single matrix tile
// NOTE(review): meaning inferred from the name -- confirm with the author
#define L1_MATRIX_ALLOT 512
// number of floats that fit in L2 cache
#define L2_SIZE 8192
// number of floats that fit in L3 cache
// NOTE(review): same value as L2_SIZE -- confirm this is intentional
#define L3_SIZE 8192
/*
 * Write the transpose of the n-by-n matrix src into dst, visiting the
 * matrix in blocksize-by-blocksize tiles.
 *
 *   n         - matrix dimension; nothing is written when n <= 0
 *   blocksize - tile edge length; a value <= 0 or > n is treated as n,
 *               i.e. a single tile covering the whole matrix
 *   dst       - output buffer holding at least n*n floats; must not
 *               overlap src, since src is read while dst is being written
 *   src       - input buffer of n*n floats; only read
 *
 * On return, dst[c + r*n] == src[r + c*n] for every 0 <= r, c < n.
 */
void transpose( int n, int blocksize, float *dst, float *src ) {
    if (n <= 0)
        return;

    /* A non-positive step would keep the tile loops from ever advancing. */
    if (blocksize <= 0 || blocksize > n)
        blocksize = n;

    /* Index in size_t so that i*n cannot overflow int for large n. */
    const size_t dim = (size_t)n;
    const size_t tile = (size_t)blocksize;

    for (size_t k = 0; k < dim; k += tile) {
        /* Clamp the tile at the matrix edge instead of special-casing it. */
        const size_t k_end = (dim - k < tile) ? dim : k + tile;

        for (size_t m = 0; m < dim; m += tile) {
            const size_t m_end = (dim - m < tile) ? dim : m + tile;

            for (size_t i = k; i < k_end; i++) {
                for (size_t j = m; j < m_end; j++) {
                    dst[j + i * dim] = src[i + j * dim];
                }
            }
        }
    }
}
/*
 * Single-precision square matrix multiply-accumulate:
 *     C := C + A * B
 * where A, B and C are n-by-n matrices stored in column-major format
 * (element (i,j) lives at index i + j*n). A and B are only read; C is
 * updated in place. Nothing happens when n <= 0.
 *
 * A is first copied into a transposed scratch buffer so that the dot
 * product for C(i,j) reads both operands with unit stride. If the scratch
 * buffer cannot be allocated, the routine reads A directly instead; the
 * products are accumulated in the same order, so the result is the same.
 */
void square_sgemm (int n, float* A, float* B, float* C)
{
    enum { TRANSPOSE_BLOCK = 32 };  /* tile edge handed to transpose() */

    if (n <= 0)
        return;

    const size_t dim = (size_t)n;

    /*
     * The transposed copy holds n*n floats (not n). Only allocate when
     * n*n*sizeof(float) is representable in size_t; (size_t)-1 is SIZE_MAX.
     */
    float *A_trans = NULL;
    if (dim <= (size_t)-1 / sizeof *A_trans / dim)
        A_trans = calloc(dim * dim, sizeof *A_trans);
    if (A_trans != NULL)
        transpose(n, TRANSPOSE_BLOCK, A_trans, A);

    /* For each row i of A */
    for (size_t i = 0; i < dim; ++i) {
        /* Row i of A is contiguous in the transposed copy. */
        const float *a_row = (A_trans != NULL) ? A_trans + i * dim : NULL;

        /* For each column j of B */
        for (size_t j = 0; j < dim; ++j) {
            const float *b_col = B + j * dim;

            /* Compute C(i,j) */
            float cij = C[i + j * dim];
            if (a_row != NULL) {
                for (size_t k = 0; k < dim; k++)
                    cij += a_row[k] * b_col[k];
            } else {
                /* Allocation failed: strided access straight into A. */
                for (size_t k = 0; k < dim; k++)
                    cij += A[i + k * dim] * b_col[k];
            }
            C[i + j * dim] = cij;
        }
    }

    free(A_trans);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment