#!/usr/local/bin/python
# Helper Functions for the Statistical package
# (C) Daniel Heath, 2006
# Available for use under the GPL
#
# The code below is written so that it runs unchanged on Python 2 and
# Python 3: output goes through single-argument print(...) calls or
# sys.stdout.write, and integer division is spelled "//".

import sys, string, math, operator


def help():
    """Print a short description of every public function in this module."""
    print("Welcome to Help\n")
    print("Functions included in this distribution:\n")
    print("isNumeric(val) -> bool (Determines whether the argument is numeric)\n")
    print("mean(list) -> float (Takes the mean of all numeric items in the list. Ignores non-numeric items)\n")
    print("median(list) -> float (Finds the median of all numeric items in the list. Ignores non-numeric items.)\n")
    print("histDict(list) -> dict (Returns a dictionary object where the keys are the elements in the list and the values are the number of times that element appears)\n")
    print("mode(list) -> float (Finds the mode of all numeric items in the list. Ignores non-numeric items.)\n")
    print("variance(list) -> float (Finds the variance of a list, ignoring non-numeric items.)\n")
    print("oneStrSubst(List, string, value) -> List (replaces string with value in list)\n")
    print("allStrSubst(List, value) -> List (replaces all strings with given value in list)\n")
    print("zeroSubst(List) -> List (replaces all strings in list with 0)\n")
    print("meanSubst(List) -> List (replaces all strings in list with mean of numeric values in list)\n")
    print("stddev(List) -> float (Finds the standard deviation of the list, ignoring non-numeric items.)\n")
    print("PearsonCorrelation(List1, List2) -> float (Finds the correlation between two lists)\n")
    print("quantize(List, step) -> List (Sorts the list and rounds every item to the nearest multiple of step)\n")
    print("pairwiseDelete(ListOfLists) -> List of Lists (Removes every position at which any of the lists holds a non-numeric item)\n")
    print("pearsonMatrix(ListOfLists) -> None (Prints the upper triangle of the correlation matrix of the lists)")


def isNumeric(val):
    """Return True when float(val) succeeds, False otherwise.

    Numeric strings such as "3.5" therefore count as numeric.
    """
    try:
        float(val)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


def _numericValues(values):
    """Return the numeric items of values, converted to float, as a new list."""
    return [float(item) for item in values if isNumeric(item)]


def mean(L2):
    """Return the arithmetic mean of the numeric items of L2.

    Non-numeric items are ignored.  Returns 0.0 when there is no numeric
    item at all.
    """
    numbers = _numericValues(L2)
    if not numbers:
        return 0.0
    return sum(numbers) / len(numbers)


def median(L2):
    """Return the median of the numeric items of L2.

    Non-numeric items are ignored and L2 itself is not modified.  For an
    even number of items the two middle values are averaged.  Returns 0.0
    when there is no numeric item.
    """
    numbers = sorted(_numericValues(L2))
    count = len(numbers)
    if not count:
        return 0.0
    middle = count // 2
    if count % 2 == 1:
        return numbers[middle]
    return (numbers[middle - 1] + numbers[middle]) / 2.0


def histDict(L):
    """Return a dict mapping each element of L to its number of occurrences.

    The elements must be hashable.  An empty input gives an empty dict.
    """
    counts = {}
    for item in L:
        counts[item] = counts.get(item, 0) + 1
    return counts


def mode(L2):
    """Return the mode of the numeric items of L2.

    Non-numeric items are ignored.  When several values share the highest
    count, their average is returned.  Returns 0.0 when there is no
    numeric item.
    """
    counts = histDict(_numericValues(L2))
    if not counts:
        return 0.0
    highest = max(counts.values())
    modes = [value for value, count in counts.items() if count == highest]
    return float(sum(modes)) / len(modes)


def variance(L2):
    """Return the population variance of the numeric items of L2.

    The sum of squared deviations is divided by the number of items (not
    by n - 1).  Non-numeric items are ignored.  Returns 0.0 when fewer
    than two numeric items are present.
    """
    numbers = _numericValues(L2)
    if len(numbers) < 2:
        return 0.0
    centre = sum(numbers) / len(numbers)
    return sum((value - centre) ** 2 for value in numbers) / len(numbers)


def oneStrSubst(L, str, val):
    """Return a copy of L in which every item equal to str is replaced by val.

    The parameter name "str" shadows the builtin inside this function; it
    is kept so that existing keyword callers continue to work.
    """
    return [val if item == str else item for item in L]


def allStrSubst(L, val):
    """Return a copy of L in which every non-numeric item is replaced by val.

    Items for which isNumeric() is true are kept unchanged (they are not
    converted to float).
    """
    return [item if isNumeric(item) else val for item in L]


def zeroSubst(L):
    """Return a copy of L with every non-numeric item replaced by 0."""
    return allStrSubst(L, 0)


def meanSubst(L):
    """Return a copy of L with every non-numeric item replaced by mean(L)."""
    return allStrSubst(L, mean(L))


def stddev(L):
    """Return the population standard deviation of the numeric items of L."""
    return math.sqrt(variance(L))


def PearsonCorrelation(X2, Y2):
    """Return the Pearson correlation coefficient of X2 and Y2.

    Every item of both sequences is converted with float().

    Returns:
        A float on success, or one of two sentinel strings that existing
        callers may compare against:
        "Different Lengths" when len(X2) != len(Y2);
        "Non-Numeric Data" when an item cannot be converted, when the
        sequences are empty, or when either sequence has zero variance
        (the coefficient is undefined in those cases).
    """
    if len(X2) != len(Y2):
        return "Different Lengths"
    try:
        X = [float(x) for x in X2]
        Y = [float(y) for y in Y2]
    except (TypeError, ValueError, OverflowError):
        return "Non-Numeric Data"
    if not X:
        return "Non-Numeric Data"

    n = float(len(X))
    sum_sq_x = 0.0
    sum_sq_y = 0.0
    sum_coproduct = 0.0
    mean_x = X[0]
    mean_y = Y[0]
    # Single pass: the running means are updated after each item and the
    # squared deviations are weighted by i / (i + 1) to compensate.
    for i in range(1, len(X)):
        sweep = float(i) / float(i + 1)
        delta_x = X[i] - mean_x
        delta_y = Y[i] - mean_y
        sum_sq_x += delta_x * delta_x * sweep
        sum_sq_y += delta_y * delta_y * sweep
        sum_coproduct += delta_x * delta_y * sweep
        mean_x += delta_x / float(i + 1)
        mean_y += delta_y / float(i + 1)

    pop_sd_x = math.sqrt(sum_sq_x / n)
    pop_sd_y = math.sqrt(sum_sq_y / n)
    cov_x_y = sum_coproduct / n
    denominator = pop_sd_x * pop_sd_y
    if denominator == 0.0:
        return "Non-Numeric Data"
    return cov_x_y / denominator


def quantize(A, step):
    """Return the items of A, sorted and rounded to the nearest multiple of step.

    An item exactly halfway between two multiples is rounded up.  A is
    not modified.
    """
    half = step / 2.0
    rounded = []
    for num in sorted(A):
        remainder = num % step
        if remainder >= half:
            rounded.append(num + step - remainder)
        else:
            rounded.append(num - remainder)
    return rounded


def pairwiseDelete(A):
    """Drop every position at which any of the lists in A is non-numeric.

    A is a list of lists.  Position k is kept only when item k of every
    list satisfies isNumeric().  Positions beyond the end of the shortest
    list are dropped.  Returns a new list holding one filtered list per
    input list; an empty A gives an empty list.
    """
    if not A:
        return []
    kept = [[] for _ in A]
    shortest = min(len(column) for column in A)
    for position in range(shortest):
        if all(isNumeric(column[position]) for column in A):
            for target, column in zip(kept, A):
                target.append(column[position])
    return kept


def pearsonMatrix(A):
    """Print the upper triangle of the correlation matrix of the lists in A.

    Non-numeric items are replaced by 0 before correlating.  The output is
    tab-separated: a blank line, a header row with the 1-based column
    numbers, then one row per list starting with its 1-based number.
    Each value is the first four characters of str(PearsonCorrelation(...)),
    so the sentinel strings appear as "Diff" and "Non-".
    """
    # Clean each list once instead of once per printed cell.
    cleaned = [allStrSubst(column, 0) for column in A]
    size = len(cleaned)
    rows = ["", "\t".join([""] + [str(number) for number in range(1, size + 1)])]
    for i in range(size):
        # Row label, then one empty cell per column left of the diagonal.
        cells = [str(i + 1)] + [""] * i
        for j in range(i, size):
            cells.append(str(PearsonCorrelation(cleaned[i], cleaned[j]))[:4])
        rows.append("\t".join(cells))
    sys.stdout.write("\n".join(rows) + "\n")