/*
 * CSSE, Monash University, Clayton, Victoria 3800, Australia
 * CSE2304 Prac 4, Group B
 * Semester 1, May 2003  -- O.R.
 * Sample Solution
 *
 * wurts [-v] [-c|-C] [fileName]
 *   Outputs an adjacency matrix of the 'distance' from one source text to
 *   another, measured as the effectiveness of coding the wurts of each text
 *   concatenated with each other text.
 *   A wurt is defined as one of:
 *       a sequence of digits
 *       a sequence of alphanumeric characters beginning with a letter
 *       a sequence of non-alphanumeric printable ASCII characters
 *   All other characters are ignored except to indicate wurt boundaries.
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <ctype.h>
#include <math.h>

/* Increment for reading (up to INC characters before realloc) */
#define INC 10

/* Used to indicate a 'missing value' in the adjacency matrix */
#define EMPTY -1

/* Self-explanatory */
#define log2(x) (log(x)/log(2))
#define max(x,y) (x>y?x:y)
#define min(x,y) (x<y?x:y)

/* Maximum file length for which to output a representation of the
 * compressed version */
#define MAX_REP_LEN 100

/* Tree node */
typedef struct TNode {
	struct TNode *left, *right;
	char *key;
	int count;
} Node;

/* Abstract "wurts" data structure */
typedef struct {
	Node *root;
	int cs; /* case sensitive */
	int wurtCount;
} WurtsDS;

/* killTree
 * Recursively frees a tree and its contents.
 */
void killTree(Node *cur) {
	if(cur) {
		/* Node exists. Destroy subtrees. */
		killTree(cur->left);
		killTree(cur->right);
		/* Free data */
		free(cur->key);
		/* Finished; free self. */
		free(cur);
	}
}

/* wurtsFreq1
 * Helper function for wurtsFreq: tree search to return the frequency of the
 * supplied wurt s.
 */
int wurtFreq1(char *s, Node *cur, int cs) {
	int cmp;
	if(!cur)
		return 0;
	/* Node exists, compare key value */
	cmp = (cs?strcmp:strcasecmp)(s, cur->key);
	if(cmp < 0)
		/* Key is greater than wurt, go left */
		return wurtFreq1(s, cur->left, cs);
	else if(cmp > 0)
		/* Key is less than wurt, go right */
		return wurtFreq1(s, cur->right, cs);
	/* Found the wurt. */
	return cur->count;
}

/* wurtsFreq
 * Returns the frequency of the provided wurt in a wurts data structure.
 */
int wurtFreq(char *wurt, WurtsDS *w) {
	/* Call helper function */
	return wurtFreq1(wurt, w->root, w->cs);
}

/* wurtProb
 * Returns the probability of a particular wurt in a wurts data structure.
 */
double wurtProb(char *wurt, WurtsDS *w) {
	/* Call helper function. *1.0 to use floating-point maths. */
	return 1.0 * wurtFreq1(wurt, w->root, w->cs) / w->wurtCount;
}

/* incWurtsCount1
 * Helper function for incWurtsCount: binary search tree insertion/update to
 * increment the frequency count of the provided wurt. The parameter cs
 * indicates case sensitivity. Returns the current node or NULL on failure.
 * Note that if something goes wrong, the operation is aborted and the tree
 * is unrecoverably destroyed; memory is not freed.
 */
Node *incWurtsCount1(char *wurt, Node *cur, int cs) {
	int cmp;
	if(cur) {
		/* Node exists, compare key value */
		cmp = (cs?strcmp:strcasecmp)(wurt, cur->key);
		if(cmp < 0) {
			/* Key is greater than wurt, go left */
			cur->left = incWurtsCount1(wurt, cur->left, cs);
		} else if(cmp > 0) {
			/* Key is less than wurt, go right */
			cur->right = incWurtsCount1(wurt, cur->right, cs);
		} else {
			/* Found the appropriate key. Increment count. */
			cur->count++;
		}
	} else {
		/* Empty subtree found, insert wurt as new node. */
		cur = (Node *)malloc(sizeof(Node));
		if(!cur) return NULL; /* Memory allocation failure */
		cur->key = strdup(wurt);
		if(!cur->key) {
			/* Memory allocation failure, clean up and abort */
			free(cur);
			return NULL;
		}
		/* No children */
		cur->left = NULL;
		cur->right = NULL;
		/* Single occurrence */
		cur->count = 1;
	}
	/* Pass node up the tree */
	return cur;
}

/* incWurtsCount
 * Increments the frequency count of the provided wurt. Returns 1 if
 * successful, 0 otherwise.
 */
int incWurtsCount(char *wurt, WurtsDS *w) {
	/* Call helper function */
	w->root = incWurtsCount1(wurt, w->root, w->cs);
	if(w->root)
		/* Added successfully; increment total */
		w->wurtCount++;
	return w->root != NULL;
}

/* initWurtsDS
 * Sets up the data structure used to store the wurts. Returns 1 if
 * successful, 0 otherwise.
 */
int initWurtsDS(WurtsDS *w, int caseSensitive) {
	/* No root node */
	w->root = NULL;
	/* Remember case sensitivity */
	w->cs = caseSensitive;
	/* No entries yet */
	w->wurtCount = 0;
	return 1;
}

/* destroyWurtsDS
 * Cleans up the data structure used to store the wurts. Returns 1 if
 * successful, 0 otherwise.
 */
int destroyWurtsDS(WurtsDS *w) {
	/* Destroy tree and its data */
	killTree(w->root);
	return 1;
}

/* resetWurtsDS
 * Reinitialises the data structure used to store the wurts. Maintains case
 * sensitivity setting. Does not cause any memory allocation failures.
 */
int resetWurtsDS(WurtsDS *w) {
	/* Destroy tree and its data */
	killTree(w->root);
	w->root = NULL;
	w->wurtCount = 0;
	return 1;
}

/* matrix
 * Allocates space for an arbitrary two-dimensional array.
 * Each element is initialised to 0.
 * Returns NULL on failure.
 * Memory must later be freed using freeMatrix.
 */
void **matrix(size_t size, int rows, int cols) {
	void **t;
	int i;

	/* Require positive input */
	assert(size > 0 && rows > 0 && cols > 0);

	/* Allocate first dimension (rows) */
	if(!(t = calloc(sizeof(void *), rows)))
		/* Handle failure */
		return NULL;

	/* Allocate each row */
	for(i=0; i<rows; i++) {
		if(!(t[i] = calloc(size, cols))) {
			/* Unwind, clean up */
			while(--i)
				free(t[i]);
			free(t);
			return NULL;
		}
		/* Clear contents of row */
		memset(t[i], 0, size * cols);
	}
	return t;
}

/* freeMatrix
 * Frees memory allocated for a matrix t.
 */
void freeMatrix(void **t, int rows) {
	int i;
	if(!t)
		return;

	/* Free each row */
	for(i=0; i<rows; i++)
		free(t[i]);

	/* Free final pointer */
	free(t);
}

/* getNextWurt
 * Reads the next wurt from the provided input stream (source). Returns a
 * dynamically allocated string containing the wurt, or NULL to indicate EOF
 * or error. Caller must free the returned wurt.
 */
char *getNextWurt(FILE *source) {
	/* Allocate one "block" of memory */
	char *t = (char *)malloc(sizeof(char) * INC);
	int i = 0, c, state;

	if(!t)
		/* Out of memory */
		return NULL;

	while(isspace(c = getc(source)) || c > 127)
		; /* eat irrelevant characters */

	/* c contains the first character of our wurt. */

	if     (isdigit(c)) state = 1; /* number     */
	else if(isalpha(c)) state = 2; /* identifier */
	else                state = 3; /* symbol     */

	while(c != EOF) {
		if((state == 1 && !isdigit(c))
		|| (state == 2 && !isalnum(c))
		|| (state == 3 && (!ispunct(c) || c > 127)))
			/* Found unwanted character, finished reading. */
			break;

		t[i++] = c; /* Add current character to string */
		if(i % INC == 0) {
			/* Out of space, expand string */
			t = (char *)realloc(t, sizeof(char) * (i + INC));
			if(!t)
				/* Out of memory */
				return NULL;
		}
		/* Get next character */
		c = getc(source);
	}

	/* Pretend we didn't just read the next character */
	ungetc(c, source);

	if(!i) {
		/* String length is zero, EOF. Clean up. */
		free(t);
		return NULL;
	}

	/* Null-terminate and return string. */
	t[i] = 0;
	return t;
}

/* outputAdjacencyMatrix
 * Prints to stdout the adjacency matrix adj using reasonably well-formatted
 * output. size is the length of both dimensions of the matrix and the
 * number of labels.
 */
void outputAdjacencyMatrix(double **adj, int size, char **labels) {
	int i, j, p, maxlen=7;

	/* Calculate max label length */
	for(i=0; i<size; i++)
		maxlen = max(maxlen, strlen(labels[i]));

	/* Output column headings */
	printf("%*s", maxlen, "from\\to");
	for(i=0; i<size; i++) {
		/* Assume 'empty' */
		p = 0;
		/* Determine 'empty' */
		for(j=0; j<size; j++)
			if(adj[i][j] != EMPTY)
				p = 1;
		/* Skip 'empty' */
		if(p)
			printf(" | %-*s",
				max(min(10, maxlen), strlen(labels[i])),
				labels[i]);
	}
	printf(" |\n");

	/* Output rows */
	for(i=0; i<size; i++) {
		/* Assume 'empty' */
		p = 0;
		/* Determine 'empty' */
		for(j=0; j<size; j++)
			if(adj[i][j] != EMPTY)
				p = 1;
		/* Skip 'empty' */
		if(!p)
			continue;
		printf("%-*s", maxlen, labels[i]);
		for(j=0; j<size; j++) {
			if(adj[i][j] == EMPTY)
				continue;
			printf(" | %*.4f",
				max(min(10, maxlen), strlen(labels[j])),
				adj[i][j]);
			if(adj[i][j] < 0)
				printf("![%d][%d]!", i, j);
		}
		printf(" |\n");
	}
}

/* bodyRepresentation
 * Outputs a schematic representation of the encoding of file in, which must
 * be open for reading, 'compressed' using the wurts data structure w.
 */
void bodyRepresentation(WurtsDS *w, FILE *in) {
	int mode = -1;
	char *s;
	while((s = getNextWurt(in))) {
		/* Check wurt's frequency */
		if(wurtFreq(s, w) > 1) {
			/* Worth compressing. */
			if(mode == 1)
				/* Last output was an inline wurt;
				 * requires end of string byte */
				printf("\033[1;30m<eos>\033[0m");
			/* Print codeword length */
			printf("[%.2g]", -log2(wurtProb(s, w)));
			/* Remember mode */
			mode = 0;
		} else {
			/* Not worth compressing, print inline. */
			if(mode != -1)
				/* Not first output; requires end of string
				 * byte */
				printf("\033[1;30m<eos>\033[0m");
			/* Print wurt */
			printf("\033[1;37m%s\033[0m", s);
			/* Remember mode */
			mode = 1;
		}
		free(s);
	}
	printf("\n\n");
}

/* bodySize
 * Returns the theoretical compressed size of the input file(s), open for
 * reading, based on the wurts data structure w. If only one file is required
 * in2 must be set to NULL.
 */
int bodySize(WurtsDS *w, FILE *in1, FILE *in2) {
	int c = 0;       /* compressed length */
	double part = 0; /* codeword run partial length */
	char *s;         /* current wurt */

	/* in1 must not be NULL */
	assert(in1);

	/* For the first file: */
	while((s = getNextWurt(in1))) {
		/* Check wurt's frequency */
		if(wurtFreq(s, w) > 1) {
			/* Worth compressing. */
			/* Remember codeword length in bits */
			part -= log2(wurtProb(s, w));
		} else {
			/* Not worth compressing, print inline. */
			if(part > 0)
				/* Preceded by codewords, add length
				 * in bytes */
				c += ceil(part/8) + 1;
			/* Add wurt length and end of string byte */
			c += strlen(s) + 1;
			/* Clear cumulated codeword lengths */
			part = 0;
		}
		free(s);
	}
	if(in2) {
		/* For the second file: */
		while((s = getNextWurt(in2))) {
			/* Repeat all of the above */
			if(wurtFreq(s, w) > 1) {
				part -= log2(wurtProb(s, w));
			} else {
				if(part > 0)
					c += ceil(part/8) + 1;
				c += strlen(s) + 1;
				part = 0;
			}
			free(s);
		}
	}
	if(part > 0)
		/* Last wurt was encoded; add run length */
		c += ceil(part/8);
	else if(c>0)
		/* Last was inline; remove redundant end of string byte */
		c--;
	/* Return calculated compressed length */
	return c;
}

/* headerSize1
 * Helper function for headerSize. Returns the size of header contributed by
 * a subtree of the wurts data structure, calculated recursively.
 */
int headerSize1(Node *cur) {
	int val = 0, t;
	if(!cur)
		/* Node does not exist, no contribution */
		return 0;
	if(cur->count > 1) {
		/* This node has a frequency higher than 1; the key is worth
		 * compressing */
		val = strlen(cur->key) + 1; /* Plus 1 for end of string */
		if(isdigit(*(cur->key)))
			/* Number wurts require a separator to disambiguate
			 * them from the frequency count */
			val++;
		t = cur->count;
		/* Determine and add the number of digits in the wurt's
		 * frequency count */
		while(t>0) {
			val++;
			t /= 10;
		}
	}
	/* Add current node to header size of subtrees and return */
	return val + headerSize1(cur->left) + headerSize1(cur->right);
}

/* headerSize
 * Returns the size of a header for texts represented by the frequencies of
 * wurts in the supplied data structure.
 */
int headerSize(WurtsDS *w) {
	return headerSize1(w->root) + 1;
}

/* fillMatrix
 * Returns the value to be inserted at position [i][j] in the adjacency
 * matrix. Requires the file in is open for reading, to be concatenated
 * with filenames[j] for compression. Also populates len[i] and compressed[i]
 * as appropriate.
 * Note: this function is designed for convenience, not reuse.
 */
double fillMatrix(int i, int j, FILE *in, char *filenames[], WurtsDS *wp,
		int *len, int *compressed, int verbose) {
	FILE *in2;
	char *s;
	int l, t, val;

	if(!*filenames[j]) {
		/* Filename is empty (indicative of
		 * read error). Skip. */
		return EMPTY;
	}
	if(!(in2 = fopen(filenames[j], "r"))) {
		/* Can't open file. Skip. */
		return EMPTY;
	}
	/* Clear structure containing wurt frequencies */
	resetWurtsDS(wp);
	/* Go back to start of file 1 */
	rewind(in);
	l = 0; /* Clear length counter */
	/* Repopulate data structure with file 1 */
	while((s = getNextWurt(in))) {
		/* Remember and free current wurt */
		if(!incWurtsCount(s, wp))
			/* Out of memory */
			return 0;
		l += strlen(s); /* Add to length */
		free(s);
	}
	if(!compressed[i]) {
		/* Have not already stored compressed length of current file;
		 * calculate and remember. */

		/* Get file length */
		len[i] = l;
		/* Get header size */
		t = headerSize(wp);
		/* Compressed length is header size plus body size */
		rewind(in);
		compressed[i] = t + bodySize(wp, in, NULL);
		
		if(verbose) {
			/* Print useful file info */
			printf("'%s': length %d, compressed %d (%d+%d)\n",
					filenames[i], len[i],
					compressed[i], t, compressed[i]-t);
			if(len[i] < MAX_REP_LEN) {
				/* Short file, output compressed body
				 * representation */
				rewind(in);
				bodyRepresentation(wp, in);
			}
		}
	}
	/* Add contents of file 2 to the wurts data structure */
	while((s = getNextWurt(in2))) {
		/* Remember and free current wurt */
		if(!incWurtsCount(s, wp))
			/* Out of memory */
			return 0;
		free(s);
	}
	/* Back to the start of both files */
	rewind(in);
	rewind(in2);
	/* Calculate full compressed length of concatenated files */
	val = headerSize(wp) + bodySize(wp, in, in2);
	/* Finished with file 2 */
	fclose(in2);
	/* Return matrix value */
	return val;
}

/* getAdjacencyMatrix
 * Determines and outputs the adjacency matrix of source text distances by
 * determining the normalised theoretical compressed length of texts using
 * a sub-optimal encoding technique. Compares all files against each other.
 * File errors are non-fatal.
 * cs represents required case sensitivity (nonzero = case sensitive).
 * verbose indicates output level (0 = normal, 1 = verbose).
 * Returns 0 on success, 1 on allocation failure.
 */
int getAdjacencyMatrix(double **adj, int cs, int numfiles,
		char *filenames[], int verbose) {
	WurtsDS wurt;
	FILE *in = NULL;
	int i, j;
	int *len = NULL, *compressed = NULL;

	assert(numfiles > 0);

	/* Set up */

	/* Space to store file lengths */
	len = (int *)calloc(sizeof(int), numfiles);
	if(!len)
		/* Couldn't allocate space for lengths */
		return 0; /* failure */

	/* Space to store file lengths */
	compressed = (int *)calloc(sizeof(int), numfiles);
	if(!compressed) {
		/* Couldn't allocate space for compressed lengths */
		free(len);
		return 0; /* failure */
	}

	if(!initWurtsDS(&wurt, cs)) {
		/* Couldn't allocate space for data structure, clean up */
		free(compressed);
		free(len);
		return 0; /* failure */
	}
	
	/* For each file as a model: */
	for(i=0; i<numfiles; i++) {
		if(filenames[i][0] == '\0') {
			/* Empty filename indicates previous error */
			for(j=0; j<numfiles; j++)
				adj[i][j] = EMPTY;
			continue;
		}
		/* Open current file */
		if(!(in = fopen(filenames[i], "r"))) {
			fprintf(stderr,
				"warning: can't open file '%s'\n",
				filenames[i]);

			/* Indicate problem so file is not opened later */
			filenames[i][0] = '\0';
			/* Clear relevant entries in matrix */
			for(j=0; j<numfiles; j++)
				adj[i][j] = EMPTY;
		} else {
			for(j=0; j<numfiles; j++)
				/* Messy function call to populate
				 * appropriate matrix value */
				adj[i][j] = fillMatrix(i, j, in, filenames,
						&wurt, len, compressed,
						verbose);
			/* Finished with current file */
			fclose(in);
		}
	}

	/* Calculate distances */
	for(i=0; i<numfiles; i++)
		for(j=0; j<numfiles; j++)
			if(adj[i][j] != EMPTY && len[j] > 0) {
				/* For non-empty values, calculate normalised
				 * compression ratio. */
				adj[i][j] =
					(adj[i][j] - compressed[i]) / len[j];
			} else if(adj[i][j] != EMPTY) {
				/* Zero-length file. */
				adj[i][j] = 1e99999; /* really really big */
			}

	/* Clean up */
	free(compressed);
	free(len);
	destroyWurtsDS(&wurt);
	return 1;
}

/* main
 * Error-checks parameters and handles error messages.
 */
int main(int argc, char *argv[]) {
	int cs = 1, v = 0;
	double **adj;
	char *name = *argv;

	/* Skip program name. */
	argv++; argc--;

	/* Check for command-line options. */
	if(argc && !strcmp(*argv, "-v")) {
		v = 1; /* verbose */
		argv++; argc--;
	}
	if(argc) {
		if(!strcmp(*argv, "-c")) {
			cs = 0;     /* insensitive */
			argv++; argc--;
		} else if(!strcmp(*argv, "-C")) {
			argv++; argc--; /* default */
		} else if(!strcmp(*argv, "-v")) {
			v = 1; /* verbose */
			argv++; argc--;
		}
	}

	/* Check for filenames. */
	if(!argc) {
		/* Not enough; output a usage message and quit. */
		fprintf(stderr, "usage: %s [-v] [-c|-C] file1 file2 ...\n", name);
		return 1;
	}

	/* Set up adjacency matrix */
	adj = (double **)matrix(sizeof(double), argc, argc);
	if(!adj) {
		/* Couldn't allocate space for adjacency matrix */
		fprintf(stderr, "error: could not allocate memory\n");
		return 4;
	}

	/* Call the getAdjacencyMatrix function to do all the dirty work. */
	if(!getAdjacencyMatrix(adj, cs, argc, argv, v)) {
		/* Something went wrong allocating memory; quit. */
		fprintf(stderr, "error: could not allocate memory\n");
	}

	outputAdjacencyMatrix(adj, argc, argv);

	freeMatrix((void **)adj, argc);

	/* Finished. */
	return 0;
}