/* File: gaussian_mixture.c * Author: David Squire (DMS), David.Squire@infotech.monash.edu.au * Date Created: 20050804 * Date Last Modified: 20050813 * - this version has all required functions in this single file, * so that users do not need to have my libraries, headers, etc. * * Purpose: This program generates a specified number of samples from a * probability distribution specified by a mixture of Gaussians. The number * of samples desired, and the parameters of the mixture, are specified in * a file that is either given on the command line, or read from stdin. The * results are written to stdout. * * The format of the input file is as follows: * * - numSamples (n) * - numDataDimensions (d) * - numGaussians (k) * - weight_1, weight_2, ..., weight_k # relative weights of Gaussians in * mixture * - k lines of means of each Gaussian, each specified as a line of d * comma-separated values * - d x d covariance matrices of each of the k Gaussians, each specified * as d lines of d comma-separated values. * * Note that lines beginning with '#' are ignored, as is anything after a * '#' on a line. * * Here is an example input file: * 100000 # number of samples 2 # number of dimensions 4 # number of Gaussians 0.4, 0.2, 0.2, 0.2 # weights of Gaussians # Means of Gaussians 0.3, 0.7 # Gaussian 1 -5, 5 # Gaussian 2 5, 5 # Gaussian 3 2, -3 # Gaussian 4 # Covariance matrices of Gaussians 7, 0 # Gaussian 1 0, 7 # 1, 0.5 # Gaussian 2 0.5, 5 # 1, -0.5 # Gaussian 3 -0.5, 1 # 5, 0 # Gaussian 4 0, 0.2 * * */ #include #include #include #include #include /*****************************************************************************/ /* From my file matrix_utils.c */ /*****************************************************************************/ /***************************************/ /* handy output routines for debugging */ void printVector(double *Vector, int rows) { int i; printf("[ "); for (i = 0; i < rows; i++) { printf("%16.6f", Vector[i]); } printf(" ]\n"); } void printMatrix(double **Matrix, int rows, int cols) { int i, j; for (i = 0; i < rows; i++) { printf("[ "); for (j = 0; j < cols; j++) { printf("%16.6f", Matrix[i][j]); } printf(" ]\n"); } } /***************************************/ /* Below are routines for creating the matrices that numrec wants */ double *dVector(int n) { double *v; v = (double *)calloc(n, sizeof(double)); return(v); } double **dMatrix(int nrows, int ncols) { double **m; int i; m = (double **)calloc(nrows, sizeof(double *)); for (i = 0; i < nrows; i++) m[i] = (double *)calloc(ncols, sizeof(double)); return(m); } void dFreeMatrix(double **m, int nrows, int ncols) { int i; for (i = 0; i < nrows; i++) free(m[i]); free(m); } /* multiplication of matrix by a vector */ double *dMultiplyVector(double **m, double *v, int nrows, int ncols) { double *product; int i, j; /* allocate space for the product */ product = dVector(nrows); /* calculate the product. Since dVector uses calloc, we don't need to * initialize */ for (i = 0; i < nrows; i++) { for (j = 0; j < ncols; j++) { product[i] += m[i][j]*v[j]; } } return product; } /*****************************************************************************/ /* From my file sq_utils.c */ /*****************************************************************************/ int strsplit(char *sin, char ***sout, char c) { /* Takes a string sin, and splits it at each occurence of character c. * The results of the split are placed in an array of strings, *sout, * and the number of results n + 1 is returned. No space is * allocated for sout if there are no occurences of c, in which case 0 * is returned. * Author: David Squire * Date: 20050804 */ int i, j, n, m; char *p, *buff; /* count the number of occurences of c in sin */ n = 0; p = sin; while (*p) { if (*p++ == c) { n++; } } if (n == 0) { return 0; } /* allocate space for the array of strings */ *sout = (char **)malloc((n + 1)*sizeof(char *)); /* extract each substring, store it in the array */ buff = (char *)malloc((1 + strlen(sin))*sizeof(char)); /* buffer */ p = sin; i = 0; while (*p) { m = 0; while (*p && (*p != c)) { /* copy characters into the buffer */ buff[m++] = *p++; /* until we see c, or end of string */ } /* allocate space for this substring, and copy it from the buffer */ (*sout)[i] = (char *)malloc((m + 1)*sizeof(char)); for (j = 0; j < m; j++) { (*sout)[i][j] = buff[j]; } (*sout)[i][j] = '\0'; /* terminate the string */ i++; if (*p) { p++; /* skip the splitting character */ } } free(buff); if (i != (n + 1)) { // the string must have had c as the final character (*sout)[n] = (char *)malloc(1*sizeof(char)); (*sout)[n][0] = '\0'; } return (n + 1); } int getline(FILE *f, char *buff, char skipchar, int MAXLINELENGTH) { /* Read a line from a file f, and store it in space buff, which must * already have been allocated memory of at least MAXLINELENGTH * characters. Lines beginning with skipchar are ignored, as is * anything after an occurence of skipchar on a line. Returns false if * nothing can be read. * Author: David Squire * Date: 20050805 * Last Modified: 20050807 */ char c; int i; buff[0] = skipchar; while ((buff[0] == skipchar) && !feof(f)) { i = 0; while (((c = fgetc(f)) != EOF) && (c != '\n') && (i < MAXLINELENGTH - 1)) { if (c != skipchar) { buff[i++] = c; } else { /* suck up the remainder of the line and discard it */ while (((c = fgetc(f)) != EOF) && (c != '\n')) { ; /* do nothing */ } ungetc(c, f); /* put the \n back for the outer loop */ } } } buff[i] = '\0'; /* terminate the string */ if (i == 0) { /* first read caused EOF, or couldn't find a line with content */ return 0; } if ((i == MAXLINELENGTH - 1) && (c != '\n')) { /* line was too long */ fprintf(stderr, "\007Warning: encountered line to long to read!\n"); /* suck up the remainder of the line and discard it */ while (((c = fgetc(f)) != EOF) && (c != '\n')) { ; /* do nothing */ } } return 1; } int line2vec(char *line, double *vec, int d) { /* parse a comma-separated line of d numbers into a vector of doubles. * Returns false if there is a problem. * Author: David Squire * Date: 20050807 */ char **s; /* array of strings, used for split results */ int i; if (strsplit(line, &s, ',') != d) { fprintf(stderr, "Incorrect number of elements on line '%s'.\n", line); return 0; } for (i = 0; i < d; i++) { vec[i] = atof(s[i]); free(s[i]); } free(s); return 1; } int *pick_n_from_weights(int n, double *w, int d) { /* Pick n integers from [0, d-1] according to probabilities specified * by weight vector w. * Author: David Squire * Date: 20050807 */ double weight_sum, *cum_probs, p; int i, j, *results; weight_sum = 0; cum_probs = (double *)malloc(d*sizeof(double)); for (i = 0; i < d; i++) { weight_sum += w[i]; } cum_probs[0] = w[0]/weight_sum; for (i = 1; i < d; i++) { cum_probs[i] = cum_probs[i - 1] + w[i]/weight_sum; } results = (int *)malloc(n*sizeof(int)); for (j = 0; j < n; j++) { p = drand48(); for (i = 0; i < d; i++) { if (cum_probs[i] > p) { break; } } results[j] = i; } free(cum_probs); return results; } /*****************************************************************************/ /* From my file jacobi.c */ /*****************************************************************************/ /* this is just the NumRec routine, changed to use doubles instead of floats */ /* DMS 960318: I have hacked it to do without the numrec library, and to use standard C matrix indices [0, n-1]. */ /* DMS: I really hate macros like this that depend on variable names that are not passed to them!! */ #define ROTATE(a,i,j,k,l) g=a[i][j];h=a[k][l];a[i][j]=g-s*(h+g*tau);\ a[k][l]=h+s*(g-h*tau); void jacobi(a,n,d,v,nrot) /* Computes all eigenvalues and eigenvectors of a real symmetric matrix a[0..n-1][0..n-1]. On output, elements of a above the diagonal are destroyed. d[0..n-1] returns the eigenvalues of a. v[0..n-1][0..n-1] is a matrix whose columns contain, on output, the normalized eigenvectors of a. nrot returns the number of Jacobi rotations that were required. */ double **a,d[],**v; int n,*nrot; { int j, iq, ip, i; double tresh, theta, tau, t, sm, s, h, g, c, *b, *z; b = dVector(n); z = dVector(n); /* initialise v to the identity matrix */ for (ip = 0; ip < n; ip++) { for (iq = 0; iq < n; iq++) v[ip][iq] = 0.0; v[ip][ip] = 1.0; } /* initialise b and d to the diagonal of a */ for (ip = 0; ip < n; ip++) { b[ip] = d[ip] = a[ip][ip]; z[ip] = 0.0; } *nrot = 0; for (i = 1; i <= 50; i++) { /* sum off-diagonal elements */ /* DMS: surely this is just the lower triangle? */ sm = 0.0; for (ip = 0; ip < n-1; ip++) { for (iq = ip+1; iq < n; iq++) sm += fabs(a[ip][iq]); } if (sm == 0.0) { /* The normal return, which relies on quadratic free(z); covergence to machine underflow. */ free(b); return; } if (i < 4) tresh = 0.2*sm/(n*n); /* On the first three sweeps... */ else tresh = 0.0; /* ...thereafter */ for (ip = 0; ip < n-1; ip++) { for (iq = ip+1; iq < n; iq++) { g = 100.0*fabs(a[ip][iq]); /* After four sweeps, skip the rotation if the off-diagonal element is small */ if (i > 4 && fabs(d[ip])+g == fabs(d[ip]) && fabs(d[iq])+g == fabs(d[iq])) a[ip][iq] = 0.0; else if (fabs(a[ip][iq]) > tresh) { h = d[iq]-d[ip]; if (fabs(h)+g == fabs(h)) t = (a[ip][iq])/h; else { theta = 0.5*h/(a[ip][iq]); t = 1.0/(fabs(theta)+sqrt(1.0+theta*theta)); if (theta < 0.0) t = -t; } c = 1.0/sqrt(1+t*t); s = t*c; tau = s/(1.0+c); h = t*a[ip][iq]; z[ip] -= h; z[iq] += h; d[ip] -= h; d[iq] += h; a[ip][iq] = 0.0; /* Case of rotations 0 <= j < p */ for (j = 0; j <= ip-1; j++) { ROTATE(a, j, ip, j, iq) } /* Case of rotations p < j < q */ for (j = ip+1; j <= iq-1; j++) { ROTATE(a, ip, j, j, iq) } /* Case of rotations q < j < n */ for (j = iq+1; j < n; j++) { ROTATE(a, ip, j, iq, j) } for (j = 0; j < n; j++) { ROTATE(v, j, ip, j, iq) } ++(*nrot); } } } for (ip = 0; ip < n; ip++) { b[ip] += z[ip]; d[ip] = b[ip]; z[ip] = 0.0; } } fprintf(stderr, "Too many iterations in routine JACOBI\n"); } #undef ROTATE /*****************************************************************************/ /* From my file gasdev.c */ /*****************************************************************************/ /* Adapted from Numerical Recipes in C (2nd Edition) code */ /* David Squire (DMS) 20050804 */ double gasdev(long *idum) { /* Returns a normally distributed deviate with zero mean and unit variance. * */ static int iset = 0; static double gset; double fac, rsq, v1, v2; if (*idum < 0) iset = 0; /* Reinitialize. */ if (iset == 0) { /* We don't have an extra deviate handy, so pick two uniform * numbers in the square extending from -1 to +1 in each direction, * see if they are in the unit circle, and if they are not, try * again. */ do { v1 = 2.0*drand48() - 1.0; v2 = 2.0*drand48() - 1.0; rsq = v1*v1 + v2*v2; } while (rsq >= 1.0 || rsq == 0.0); fac = sqrt(-2.0*log(rsq)/rsq); /* Now make the Box-Muller transformation to get two normal * deviates. Return one and save the other for next time. */ gset = v1*fac; iset = 1; /* Set flag. */ return v2*fac; } else { /* We have an extra deviate handy, */ iset=0; /* so unset the flag, */ return gset; /* and return it. */ } } double drandgaussian(double mean, double stddev) { /* Author: DMS * Date: 20050804 * Description: Returns a random sample from a Gaussian distribution with * mean and standard deviation as specified by the parameters. Really just * a wrapper for the Numerical Recipes gasdev above. */ static long idum = -1; return mean + stddev*gasdev(&idum); } /*****************************************************************************/ /* Beginning of my normal gaussian_mixture.c */ /*****************************************************************************/ int n; /* number of samples desired */ int d; /* number of data dimensions */ int k; /* number of Gaussians in the mixture */ double *w; /* array of weights of Gaussian components */ double **m; /* array of k means, each of d dimensions */ double ***S; /* array of k covariance matrices, each d x d */ double ***E; /* array of k matrices, each d x d, containing the * eigenvectors of each covariance matrix */ double **lambda; /* array of k vectors, each of d dimensions, containing the * eigenvalues of each covariance matrix */ #define MAXLINELENGTH 256 /* maximum line length accepted in input file */ void parseInput(FILE *infile) { char line[MAXLINELENGTH]; int i, j; char **s; /* array of strings, used for split results */ #ifdef DEBUG printf("Parsing input file...\n"); #endif if (!getline(infile, line, '#', MAXLINELENGTH)) { fprintf(stderr, "Error occurred when reading file.\n"); exit(1); } n = atoi(line); /* number of samples */ if (!getline(infile, line, '#', MAXLINELENGTH)) { fprintf(stderr, "Error occurred when reading file.\n"); exit(1); } d = atoi(line); /* number of dimensions */ if (!getline(infile, line, '#', MAXLINELENGTH)) { fprintf(stderr, "Error occurred when reading file.\n"); exit(1); } k = atoi(line); /* number of Gaussians */ #ifdef DEBUG printf("Want %d samples from %d %d-dimensional Gaussians\n", n, k, d); #endif /* allocate space for the weights of the Gaussians */ w = dVector(k); /* read in the weights of the Gaussians */ if (!getline(infile, line, '#', MAXLINELENGTH)) { fprintf(stderr, "Unexpected end of input file.\n"); exit(1); } if (!line2vec(line, w, k)) { fprintf(stderr, "Error extracting %d numbers from '%s'\n", k, line); exit(1); } #ifdef DEBUG printf("Weights: \n"); printVector(w, k); #endif /* allocate space for d-dimensional means of the Gaussians */ m = (double **)malloc(k*sizeof(double *)); for (i = 0; i < k; i++) { m[i] = dVector(d); } /* read in the means of the Gaussians */ for (i = 0; i < k; i++) { if (!getline(infile, line, '#', MAXLINELENGTH)) { fprintf(stderr, "Unexpected end of input file.\n"); exit(1); } if (!line2vec(line, m[i], d)) { fprintf(stderr, "Error extracting %d numbers from '%s'\n", d, line); exit(1); } } #ifdef DEBUG printf("Means: \n"); for (i = 0; i < k; i++) { printf("Gaussian %d:\n", i); printVector(m[i], d); } #endif /* allocate space for the k covariance matrices of the Gaussians, each * d x d */ S = (double ***)malloc(k*sizeof(double **)); for (i = 0; i < k; i++) { S[i] = dMatrix(d, d); } /* read in the covariance matrices */ /* TODO In a perfect world, we would check that the covariance matrices * are symmetrical, and thus valid - better yet, we would have an input * file format that enforced it (e.g. just specify upper triangle of * the matrix). There are also other constraints that should be checked, * e.g. covariances (off-diagonal) that are too big for the variances * (on-diagonal). */ for (i = 0; i < k; i++) { /* foreach Gaussian */ for (j = 0; j < d; j++) { /* d lines of numbers each */ if (!getline(infile, line, '#', MAXLINELENGTH)) { fprintf(stderr, "Unexpected end of input file.\n"); exit(1); } if (!line2vec(line, S[i][j], d)) { fprintf(stderr, "Error extracting %d numbers from '%s'\n", d, line); exit(1); } } } #ifdef DEBUG printf("Covariance matrices: \n"); for (i = 0; i < k; i++) { printf("Gaussian %d:\n", i); printMatrix(S[i], d, d); } printf("\n"); #endif } int main(int argc, char *argv[]) { FILE *infile; int i, j, *g; int nrot; /* needed by eigsrt. Not used in this program */ double *pa_sample, *sample; /* temp. storage for generated sample */ srand48(getpid()); /* seed random number generator */ /* check if an input file has been specified, if not use stdin */ switch(argc) { case 1: infile = stdin; break; case 2: if (infile = fopen(argv[1], "r")) { break; } else { fprintf(stderr, "Can not open %s for reading. Bye.\n", argv[1]); exit(1); } default: fprintf(stderr, "Usage: %s [inputfile]\n", argv[0]); exit(1); } /* get the number of samples desired, and the parameters of the * Gaussian mixture */ parseInput(infile); /* Now, we need to find the eigenvectors and eigenvalues of the * covariance matrix of each Gaussian. The orthonormal matrix of * eigenvectors will give us the rotation transformation from the * principal axes of the hyperellipsoid corresponding to the Gaussian * back to the original coordinate system, and the eigenvalues will * give us the variance along each principal axis. So, the procedure to * generate the samples will be to generate them in the coordinate * system defined by the principal axes, with appropriate variances, * then rotate back to the original coordinate system, and finally * shift by the means. */ /* allocate space for eigenvectors and eigenvalues */ E = (double ***)malloc(k*sizeof(double **)); lambda = (double **)malloc(k*sizeof(double *)); for (i = 0; i < k; i++) { E[i] = dMatrix(d, d); lambda[i] = dVector(d); } /* compute the eigenvectors and eigenvalues */ for (i = 0; i < k; i++) { /* note that jacobi DESTROYS its input matrix */ jacobi(S[i], d, lambda[i], E[i], &nrot); #ifdef DEBUG printf("Gaussian %d:\n", i); printf("Eigenvalues:\n"); printVector(lambda[i], d); printf("Eigenvectors:\n"); printMatrix(E[i], d, d); printf("\n"); #endif /* take square roots of eigenvalues, since we actually want standard * deviations, not variances */ for (j = 0; j < d; j++) { lambda[i][j] = sqrt(lambda[i][j]); } } /* generate the samples */ pa_sample = dVector(d); /* get list of indices to select Gaussians, according to probabilities * given by the weights (getting n at a time is faster than doing it in * the loop). */ g = pick_n_from_weights(n, w, k); for (i = 0; i < n; i++) { /* generate sample in principal axes space */ for (j = 0; j < d; j++) { pa_sample[j] = drandgaussian(0, lambda[g[i]][j]); } /* rotate from principal axes space back to normal space */ sample = dMultiplyVector(E[g[i]], pa_sample, d, d); /* shift to the means */ for (j = 0; j < d; j++) { sample[j] += m[g[i]][j]; } /* print out the sample */ for (j = 0; j < d - 1; j++) { printf("%f, ", sample[j]); } printf("%f\n", sample[j]); free(sample); } }