/* * CSSE, Monash University, Clayton, Victoria 3800, Australia * CSE2304 Prac 4, Group B * Semester 1, May 2003 -- O.R. * Sample Solution * * wurts [-v] [-c|-C] [fileName] * Outputs an adjacency matrix of the 'distance' from one source text to * another, measured as the effectiveness of coding the wurts of each text * concatenated with each other text. * A wurt is defined as one of: * a sequence of digits * a sequence of alphanumeric characters beginning with a letter * a sequence of non-alphanumeric printable ASCII characters * All other characters are ignored except to indicate wurt boundaries. */ #include #include #include #include #include #include /* Increment for reading (up to INC characters before realloc) */ #define INC 10 /* Used to indicate a 'missing value' in the adjacency matrix */ #define EMPTY -1 /* Self-explanatory */ #define log2(x) (log(x)/log(2)) #define max(x,y) (x>y?x:y) #define min(x,y) (xleft); killTree(cur->right); /* Free data */ free(cur->key); /* Finished; free self. */ free(cur); } } /* wurtsFreq1 * Helper function for wurtsFreq: tree search to return the frequency of the * supplied wurt s. */ int wurtFreq1(char *s, Node *cur, int cs) { int cmp; if(!cur) return 0; /* Node exists, compare key value */ cmp = (cs?strcmp:strcasecmp)(s, cur->key); if(cmp < 0) /* Key is greater than wurt, go left */ return wurtFreq1(s, cur->left, cs); else if(cmp > 0) /* Key is less than wurt, go right */ return wurtFreq1(s, cur->right, cs); /* Found the wurt. */ return cur->count; } /* wurtsFreq * Returns the frequency of the provided wurt in a wurts data structure. */ int wurtFreq(char *wurt, WurtsDS *w) { /* Call helper function */ return wurtFreq1(wurt, w->root, w->cs); } /* wurtProb * Returns the probability of a particular wurt in a wurts data structure. */ double wurtProb(char *wurt, WurtsDS *w) { /* Call helper function. *1.0 to use floating-point maths. */ return 1.0 * wurtFreq1(wurt, w->root, w->cs) / w->wurtCount; } /* incWurtsCount1 * Helper function for incWurtsCount: binary search tree insertion/update to * increment the frequency count of the provided wurt. The parameter cs * indicates case sensitivity. Returns the current node or NULL on failure. * Note that if something goes wrong, the operation is aborted and the tree * is unrecoverably destroyed; memory is not freed. */ Node *incWurtsCount1(char *wurt, Node *cur, int cs) { int cmp; if(cur) { /* Node exists, compare key value */ cmp = (cs?strcmp:strcasecmp)(wurt, cur->key); if(cmp < 0) { /* Key is greater than wurt, go left */ cur->left = incWurtsCount1(wurt, cur->left, cs); } else if(cmp > 0) { /* Key is less than wurt, go right */ cur->right = incWurtsCount1(wurt, cur->right, cs); } else { /* Found the appropriate key. Increment count. */ cur->count++; } } else { /* Empty subtree found, insert wurt as new node. */ cur = (Node *)malloc(sizeof(Node)); if(!cur) return NULL; /* Memory allocation failure */ cur->key = strdup(wurt); if(!cur->key) { /* Memory allocation failure, clean up and abort */ free(cur); return NULL; } /* No children */ cur->left = NULL; cur->right = NULL; /* Single occurrence */ cur->count = 1; } /* Pass node up the tree */ return cur; } /* incWurtsCount * Increments the frequency count of the provided wurt. Returns 1 if * successful, 0 otherwise. */ int incWurtsCount(char *wurt, WurtsDS *w) { /* Call helper function */ w->root = incWurtsCount1(wurt, w->root, w->cs); if(w->root) /* Added successfully; increment total */ w->wurtCount++; return w->root != NULL; } /* initWurtsDS * Sets up the data structure used to store the wurts. Returns 1 if * successful, 0 otherwise. */ int initWurtsDS(WurtsDS *w, int caseSensitive) { /* No root node */ w->root = NULL; /* Remember case sensitivity */ w->cs = caseSensitive; /* No entries yet */ w->wurtCount = 0; return 1; } /* destroyWurtsDS * Cleans up the data structure used to store the wurts. Returns 1 if * successful, 0 otherwise. */ int destroyWurtsDS(WurtsDS *w) { /* Destroy tree and its data */ killTree(w->root); return 1; } /* resetWurtsDS * Reinitialises the data structure used to store the wurts. Maintains case * sensitivity setting. Does not cause any memory allocation failures. */ int resetWurtsDS(WurtsDS *w) { /* Destroy tree and its data */ killTree(w->root); w->root = NULL; w->wurtCount = 0; return 1; } /* matrix * Allocates space for an arbitrary two-dimensional array. * Each element is initialised to 0. * Returns NULL on failure. * Memory must later be freed using freeMatrix. */ void **matrix(size_t size, int rows, int cols) { void **t; int i; /* Require positive input */ assert(size > 0 && rows > 0 && cols > 0); /* Allocate first dimension (rows) */ if(!(t = calloc(sizeof(void *), rows))) /* Handle failure */ return NULL; /* Allocate each row */ for(i=0; i 127) ; /* eat irrelevant characters */ /* c contains the first character of our wurt. */ if (isdigit(c)) state = 1; /* number */ else if(isalpha(c)) state = 2; /* identifier */ else state = 3; /* symbol */ while(c != EOF) { if((state == 1 && !isdigit(c)) || (state == 2 && !isalnum(c)) || (state == 3 && (!ispunct(c) || c > 127))) /* Found unwanted character, finished reading. */ break; t[i++] = c; /* Add current character to string */ if(i % INC == 0) { /* Out of space, expand string */ t = (char *)realloc(t, sizeof(char) * (i + INC)); if(!t) /* Out of memory */ return NULL; } /* Get next character */ c = getc(source); } /* Pretend we didn't just read the next character */ ungetc(c, source); if(!i) { /* String length is zero, EOF. Clean up. */ free(t); return NULL; } /* Null-terminate and return string. */ t[i] = 0; return t; } /* outputAdjacencyMatrix * Prints to stdout the adjacency matrix adj using reasonably well-formatted * output. size is the length of both dimensions of the matrix and the * number of labels. */ void outputAdjacencyMatrix(double **adj, int size, char **labels) { int i, j, p, maxlen=7; /* Calculate max label length */ for(i=0; i 1) { /* Worth compressing. */ if(mode == 1) /* Last output was an inline wurt; * requires end of string byte */ printf("\033[1;30m\033[0m"); /* Print codeword length */ printf("[%.2g]", -log2(wurtProb(s, w))); /* Remember mode */ mode = 0; } else { /* Not worth compressing, print inline. */ if(mode != -1) /* Not first output; requires end of string * byte */ printf("\033[1;30m\033[0m"); /* Print wurt */ printf("\033[1;37m%s\033[0m", s); /* Remember mode */ mode = 1; } free(s); } printf("\n\n"); } /* bodySize * Returns the theoretical compressed size of the input file(s), open for * reading, based on the wurts data structure w. If only one file is required * in2 must be set to NULL. */ int bodySize(WurtsDS *w, FILE *in1, FILE *in2) { int c = 0; /* compressed length */ double part = 0; /* codeword run partial length */ char *s; /* current wurt */ /* in1 must not be NULL */ assert(in1); /* For the first file: */ while((s = getNextWurt(in1))) { /* Check wurt's frequency */ if(wurtFreq(s, w) > 1) { /* Worth compressing. */ /* Remember codeword length in bits */ part -= log2(wurtProb(s, w)); } else { /* Not worth compressing, print inline. */ if(part > 0) /* Preceded by codewords, add length * in bytes */ c += ceil(part/8) + 1; /* Add wurt length and end of string byte */ c += strlen(s) + 1; /* Clear cumulated codeword lengths */ part = 0; } free(s); } if(in2) { /* For the second file: */ while((s = getNextWurt(in2))) { /* Repeat all of the above */ if(wurtFreq(s, w) > 1) { part -= log2(wurtProb(s, w)); } else { if(part > 0) c += ceil(part/8) + 1; c += strlen(s) + 1; part = 0; } free(s); } } if(part > 0) /* Last wurt was encoded; add run length */ c += ceil(part/8); else if(c>0) /* Last was inline; remove redundant end of string byte */ c--; /* Return calculated compressed length */ return c; } /* headerSize1 * Helper function for headerSize. Returns the size of header contributed by * a subtree of the wurts data structure, calculated recursively. */ int headerSize1(Node *cur) { int val = 0, t; if(!cur) /* Node does not exist, no contribution */ return 0; if(cur->count > 1) { /* This node has a frequency higher than 1; the key is worth * compressing */ val = strlen(cur->key) + 1; /* Plus 1 for end of string */ if(isdigit(*(cur->key))) /* Number wurts require a separator to disambiguate * them from the frequency count */ val++; t = cur->count; /* Determine and add the number of digits in the wurt's * frequency count */ while(t>0) { val++; t /= 10; } } /* Add current node to header size of subtrees and return */ return val + headerSize1(cur->left) + headerSize1(cur->right); } /* headerSize * Returns the size of a header for texts represented by the frequencies of * wurts in the supplied data structure. */ int headerSize(WurtsDS *w) { return headerSize1(w->root) + 1; } /* fillMatrix * Returns the value to be inserted at position [i][j] in the adjacency * matrix. Requires the file in is open for reading, to be concatenated * with filenames[j] for compression. Also populates len[i] and compressed[i] * as appropriate. * Note: this function is designed for convenience, not reuse. */ double fillMatrix(int i, int j, FILE *in, char *filenames[], WurtsDS *wp, int *len, int *compressed, int verbose) { FILE *in2; char *s; int l, t, val; if(!*filenames[j]) { /* Filename is empty (indicative of * read error). Skip. */ return EMPTY; } if(!(in2 = fopen(filenames[j], "r"))) { /* Can't open file. Skip. */ return EMPTY; } /* Clear structure containing wurt frequencies */ resetWurtsDS(wp); /* Go back to start of file 1 */ rewind(in); l = 0; /* Clear length counter */ /* Repopulate data structure with file 1 */ while((s = getNextWurt(in))) { /* Remember and free current wurt */ if(!incWurtsCount(s, wp)) /* Out of memory */ return 0; l += strlen(s); /* Add to length */ free(s); } if(!compressed[i]) { /* Have not already stored compressed length of current file; * calculate and remember. */ /* Get file length */ len[i] = l; /* Get header size */ t = headerSize(wp); /* Compressed length is header size plus body size */ rewind(in); compressed[i] = t + bodySize(wp, in, NULL); if(verbose) { /* Print useful file info */ printf("'%s': length %d, compressed %d (%d+%d)\n", filenames[i], len[i], compressed[i], t, compressed[i]-t); if(len[i] < MAX_REP_LEN) { /* Short file, output compressed body * representation */ rewind(in); bodyRepresentation(wp, in); } } } /* Add contents of file 2 to the wurts data structure */ while((s = getNextWurt(in2))) { /* Remember and free current wurt */ if(!incWurtsCount(s, wp)) /* Out of memory */ return 0; free(s); } /* Back to the start of both files */ rewind(in); rewind(in2); /* Calculate full compressed length of concatenated files */ val = headerSize(wp) + bodySize(wp, in, in2); /* Finished with file 2 */ fclose(in2); /* Return matrix value */ return val; } /* getAdjacencyMatrix * Determines and outputs the adjacency matrix of source text distances by * determining the normalised theoretical compressed length of texts using * a sub-optimal encoding technique. Compares all files against each other. * File errors are non-fatal. * cs represents required case sensitivity (nonzero = case sensitive). * verbose indicates output level (0 = normal, 1 = verbose). * Returns 0 on success, 1 on allocation failure. */ int getAdjacencyMatrix(double **adj, int cs, int numfiles, char *filenames[], int verbose) { WurtsDS wurt; FILE *in = NULL; int i, j; int *len = NULL, *compressed = NULL; assert(numfiles > 0); /* Set up */ /* Space to store file lengths */ len = (int *)calloc(sizeof(int), numfiles); if(!len) /* Couldn't allocate space for lengths */ return 0; /* failure */ /* Space to store file lengths */ compressed = (int *)calloc(sizeof(int), numfiles); if(!compressed) { /* Couldn't allocate space for compressed lengths */ free(len); return 0; /* failure */ } if(!initWurtsDS(&wurt, cs)) { /* Couldn't allocate space for data structure, clean up */ free(compressed); free(len); return 0; /* failure */ } /* For each file as a model: */ for(i=0; i 0) { /* For non-empty values, calculate normalised * compression ratio. */ adj[i][j] = (adj[i][j] - compressed[i]) / len[j]; } else if(adj[i][j] != EMPTY) { /* Zero-length file. */ adj[i][j] = 1e99999; /* really really big */ } /* Clean up */ free(compressed); free(len); destroyWurtsDS(&wurt); return 1; } /* main * Error-checks parameters and handles error messages. */ int main(int argc, char *argv[]) { int cs = 1, v = 0; double **adj; char *name = *argv; /* Skip program name. */ argv++; argc--; /* Check for command-line options. */ if(argc && !strcmp(*argv, "-v")) { v = 1; /* verbose */ argv++; argc--; } if(argc) { if(!strcmp(*argv, "-c")) { cs = 0; /* insensitive */ argv++; argc--; } else if(!strcmp(*argv, "-C")) { argv++; argc--; /* default */ } else if(!strcmp(*argv, "-v")) { v = 1; /* verbose */ argv++; argc--; } } /* Check for filenames. */ if(!argc) { /* Not enough; output a usage message and quit. */ fprintf(stderr, "usage: %s [-v] [-c|-C] file1 file2 ...\n", name); return 1; } /* Set up adjacency matrix */ adj = (double **)matrix(sizeof(double), argc, argc); if(!adj) { /* Couldn't allocate space for adjacency matrix */ fprintf(stderr, "error: could not allocate memory\n"); return 4; } /* Call the getAdjacencyMatrix function to do all the dirty work. */ if(!getAdjacencyMatrix(adj, cs, argc, argv, v)) { /* Something went wrong allocating memory; quit. */ fprintf(stderr, "error: could not allocate memory\n"); } outputAdjacencyMatrix(adj, argc, argv); freeMatrix((void **)adj, argc); /* Finished. */ return 0; }