module GetTypes where import StringUtil import Data.List import Data.Maybe import Data.Char recordSeperator = "\n" fieldSeperator = "," listOpen = '[' listClose = ']' missingVals = ["#Error"] {- - listCols: - - Takes the file contents and returns a list of booleans corresponding to the columns which contain lists. - So if the data-set columns were in the format - Scalar | List | List | Scalar | List - then listCols would - return [False, True, True, False, True]. -} listCols cont = let lines = tokenise recordSeperator cont fRow = recombineList (tokenise fieldSeperator (lines!!0)) fHead l = if (l /= []) then (head l) else 'a' lInfo = map (== listOpen) (map fHead fRow) in lInfo {- - getColumns: - - Takes the file contents and extracts a list of lists containing the contents of each column. If given - the input: "1,Male\n2,Female" and the record seperator and field seperator were "\n" and "," respectively - getColumns would return [["1","2"],["Male","Female"]] -} getColumns cont = let lines = tokenise recordSeperator cont values = map recombineList (map (tokenise fieldSeperator) lines) columns = [ remMissing (getColumn i values) | i <- [0..((length (head values)) - 1)] ] in columns {- - getColumn: - - Takes a list of lists of rows and returns column colNum (indexed from zero). For example, with the input: - 1 [["1","Male"],["2","Female"]] the returned value would be ["Male","Female"]. -} getColumn colNum [] = [] getColumn colNum (x:xs) = let colData = if (colNum < (length x)) then (x!!colNum) else [] -- Checked to avoid problems if the last column is empty. colEntries = if (colData /= []) then (if (head colData == listOpen) then (tokenise "," (delete listClose (tail colData))) else []) else [] in if (colEntries == []) then if (colData /= [listOpen, listClose]) then colData:(getColumn colNum xs) else getColumn colNum xs else colEntries++(getColumn colNum xs) {- - recombineList: - - Puts lists which have been split up by the tokeniser back together. If the list open and list close - characters were respectively '[' and ']' and the argument was ["nl1","[pl1","pl2", "pl3]","nl2"] then - the return value would be ["nl1","[pl1,pl2,pl3]","nl2"] -} recombineList x = doRecombineList x [] False {- - doRecombineList: - - Performs the list recombination operation. -} doRecombineList [] _ _ = [] doRecombineList (x:xs) current open = if ((length x) > 0) then if ((not open) && ((head x) == listOpen)) then if ((x!!((length x) - 1)) == listClose) then x:(doRecombineList xs [] False) else doRecombineList xs (listOpen:(tail x)) True else if (open) then if ((x!!((length x) - 1)) == listClose) then (current++","++(delete listClose x)++[listClose]):(doRecombineList xs [] False) else doRecombineList xs (current++","++x) True else x:(doRecombineList xs [] False) else []:(doRecombineList xs [] False) {- - addListBrackets: - - Takes the type representation list and a list of booleans indicating whether or not to add list brackets - to the corresponding elements and returns a type representation list with list brackets added to it. - For example, with the arguments: [("String", []), ("T0", ["Male", "Female"])] [False, True] the returned value - would be: [("String", []), ("[T0]", ["Male", "Female"])]. -} addListBrackets [] [] = [] addListBrackets (x:xs) (y:ys) = if (y) then ((listOpen:[])++(fst x)++(listClose:[]), snd x):(addListBrackets xs ys) else x:(addListBrackets xs ys) {- - getTypes: - - Performs the operations to make the type representation list from the file contents. -} getTypes cont = addListBrackets (groupTypes (genNewTypes (getColumns cont) (getBaseTypes cont) 0 []) []) (listCols cont) {- - genNewTypes: - - Creates a name and finds the type constructors for each new variable type. If a column is found to - contain constructors which match those found in a previous column then it makes the new column that type - and adds the new constructors to the constructor list for that column. -} genNewTypes [] [] generated allCats = [] genNewTypes (x:xs) (y:ys) generated allCats = let cats = getCategories x y catsAndType = map (\c -> (c, "T"++(show generated))) cats exCat = if (cats /= []) then (filter (snd) (map (getTypeName allCats) cats)) else [] -- If a categorical type shares categories with this type then get its name. tName = if (exCat /= []) then (fst (exCat!!0)) else ("T"++(show generated)) -- If a variable type for these categories already exists then use it, otherwise generate a new type name. next = if (exCat /= []) then generated else (generated + 1) -- If a new variable type is being created then increment next so the next type generated has a higher number. in if (cats /= []) then (tName, cats):(genNewTypes xs ys next (allCats++catsAndType)) else (y, []):(genNewTypes xs ys generated allCats) {- - getTypeName: - - Takes a list of existing type constructors and the name of their type and and the name of a new constructor. - If the new constructor already exists then return a double containing the name of the type it belongs to - and True. Otherwise return an empty string and False. -} getTypeName exCats cat = let cl = map (== cat) (map fst exCats) res = findIndex (== True) cl in if (res == Nothing) then ("", False) -- Tuple contains False if there's no existing name to use. else (snd (exCats!!(fromJust res)), True) -- Tuple contains True if an existing name is used. {- - getBaseType: - - Takes the file contents and returns a list containing the "base type" of each of the columns. -} getBaseTypes cont = map getColType (getColumns cont) {- - getColType: - - Takes a list of each field in a column and returns the "base type" of that column. For example, - with the input ["4.3", "5", "64.2"] the return value would be "Float". -} getColType [x] = getFieldType x getColType (x:xs) = doGetFieldType x (getColType xs) {- - getCategories: - - Takes a list of the fields in a column and the base type of that column. If the base type is - string and the fields seem to be the same things repeating over and over then return a list - of the unique entries, guessing they're categories of a new type. For example, with the - input: ["No", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "No"] "String" the return - value would be ["No", "Yes"]. For the input ["erte", "sdg", "sdfgs", "wawff3"] "String" the - the return value is []. -} getCategories colData colType = let upperFirst c = if (c /= []) then (toUpper (head c)):(tail c) else [] stripSpace c = filter (/= ' ') c upperFields = map stripSpace (map upperFirst colData) uniqueFields = if (colType == "String") then (unique upperFields) else [] in if ((uniqueFields /= []) && ((div (length colData) (length uniqueFields)) >= 5)) then filter (/= []) (map (foldl (++) "") (map (map nonAlphaNumericSub) uniqueFields)) else [] {- - remMissing: - - Takes a list and returns a new list with all elements which are in the missingVals list removed. -} remMissing [] = [] remMissing (x:xs) = if (isJust (find (== x) missingVals)) then remMissing xs else x:(remMissing xs) {- - groupTypes: - - Takes a type representation list and a list to represent which types have been done and groups - together type constructors of types with the same name in different columns. For example, with - the input [("T0", ["Good","Bad"]), ("T0", ["Medium"]), ("String", [])] the return value would be - [("T0", ["Good", "Bad", "Medium"]), ("T0", []), ("String", [])]. -} groupTypes [] _ = [] groupTypes (x:xs) doneList = let likeIndices = findIndices (== (fst x)) (map fst xs) le = (snd x)++(groupLikeElements likeIndices (map snd xs)) in if (((find (== (fst x)) doneList) == Nothing) && ((snd x) /= [])) then ((fst x), (unique le)):(groupTypes xs ((fst x):doneList)) else ((fst x), []):(groupTypes xs doneList) {- - groupLikeElements: - - Takes a list of indexes of constructor lists to group together and a list of lists of constructors - and returns a list of grouped constructors. For example, with the input: - [0, 2] [["Good", "Bad"], ["Female"], ["Medium"]] the return value is ["Good", "Bad", "Medium"]. -} groupLikeElements [] _ = [] groupLikeElements (x:xs) y = (y!!x)++(groupLikeElements xs y) {- - unique: - - Takes a list and returns the list of unique elements. unique [1,2,3,2,1] returns [1,2,3]. -} unique x = doUnique (sort x) {- - doUnique: - - Performs the operations for the creation of a list of unique elements. -} doUnique [x] = [x] doUnique (x:y:zs) = if (x /= y) then x:(doUnique (y:zs)) else doUnique (y:zs)