BPstats.py

From OpenWetWare
Jump to navigationJump to search
#!/usr/bin/env python

#############################################################################################################################
#BPstats.py [options] <basepile output>                                                                                     #
#Example: python BPstats.py -p -h myBasePile.txt                                                                            #
#Version: 1.2   Last Modified: 12/21/2009                                                                                   #
#---------------------------------------------------------------------------------------------------------------------------#
#>Ouptuts a txt file containing statistics in the following format:                                                         #
#   GSS basepile #\tGSS ref match length\ttarget match sum\tcoverage proportion\taverage density of target\tmedian densitiy #
#>Assumes each contig is 1000 bases long                                                                                    #
#---------------------------------------------------------------------------------------------------------------------------#
#Modifiers:                                                                                                                 #
#   -d      : Save debug log                                                                                                #
#   -h      : Do not print header on first line                                                                             #
#   -p      : Print output to standard out (usually the shell being called from)                                            #
#   -r      : Make output more readable by adding extra tabs to the statistic columbs (aligns columbs to header)            #
#############################################################################################################################

###Imports / Variable Initilization##########################################################################################
import os, string, sys
defArgList  = ['BPstats.py','-H','C:/Python26/testData.txt'] #argument list used during script testing the with Python GUI IDLE  
argList     = sys.argv #argument list supplied by user
argNum      = len(argList) #the number of arguments supplied
minArgNum   = 1 #the smallest amount of arguments with which it is possible to run the script
addHead     = True #is True if the headers are to be printed on the first line
saveDubug   = False #is True if the debug is to be saved
warning     = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug law
readable    = False #is 'True' if the -r modifier is supplied; makes the output easier to read
debugLog    = ['*********DEBUG LOG*********\n'] #where all errors/anamolies are recorded; saved if the -d modifier is supplied  
savePath    = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE 
header      = 'Contig #\tRef length\tMatch sum\tCoverage %\tAve density \tMed density\n' #the header which is printed on the first line of the output
inData      = [] #a list of lists contiaing the data from the input file [[ref,density,actual],...]
parsedData  = [] #a list containing data for each 1000 bp chunk [[GSS ref match length...see above]...]
outData     = [] #a list of strings corresponding to each line in the output file
modifiers   = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods     = 'hpr' #all the modifiers reconized by the script 
activeMods  = '' #a string contiaing all mods specified by the user 
modArgs     = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers 
#############################################################################################################################

def printHelp():
    print '/---------------------------------------------------------------------------------------\'
    print '| BPstats.py [options] <basepile output>                                                |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Example: python BPstats.py -p -h myBasePile.txt                                       |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Version: 1.2   Last Edited: 12/22/2009                                                |'
    print '|\t>Ouptuts a txt file containing statistics in the following format:                   |'
    print '|\t\tGSS basepile #, GSS ref match length, target match sum, coverage proportion,       |'
    print '|\t\taverage density of target, median densitiy                                         |'
    print '|\t>Assumes each contig is 1000 bases long                                              |'
    print '|---------------------------------------------------------------------------------------|'                       
    print '| Modifiers:                                                                            |'
    print '|\t-d\t: Save debug log                                                                 |'
    print '|\t-h\t: Do not print header on first line                                              |'
    print '|\t-p\t: Print output to standard out (usually the shell being called from)             |'
    print '|\t-r\t: Make output more readable by adding extra tabs to the output columbs           |'
    print '\---------------------------------------------------------------------------------------/'

#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error                                                                        #
#>Prints the debug log to standard out (usually the screen)                                                                 #
def errorExit():
    print 'The program was forced to exit prematurly, printing debug log...\n'
    for line in debugLog:
        print line
    sys.exit()
#############################################################################################################################

###Argument Interprtation####################################################################################################
#>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE                       #
#>If no arguments are given, the help menu is printed                                                                       #
#>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation)           #
if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
    if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
        if os.path.exists(argList[-1]) == 0: #if the path dose not exist
            debugLog.append('Error: Invalid file path to input data')
            errorExit() #end the program 
    elif argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu 
    else:
        debugLog.append('Error: Too few arguments supplied\n')
        errorExit() 
else: #If the script is being imported on to IDLE 
    argList  = defArgList #use default arguments 
    argNum   = len(defArgList)
    if argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu
    savePath = defSavePath #sets the save path to the default, specified in the variable initialization section 
    debugLog.append('Alert: defualt debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
    faPath = argList[-1] #the path to the fasta file containing the input data 
    if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
        modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments
#############################################################################################################################


if helpOnly == False:
    ###Modifier Interpretation###############################################################################################
    #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"     #
    #>Given arguments are compared against a list of known arguments                                                        #
    #>Matches found change the appropriate variable for the desired effect of the modifier the script                       #
    if len(modifiers) > 0: #if modifiers are supplied
        for mod in modifiers: #loops through the list of modifiers and modifier arguments 
            if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters        
                activeMods += mod[1:] #sorts the modifiers into activeMods...
            else:  
                modArgs.append(mod) #assumes everything else to be a modifier argument 
        for letter in activeMods: #loops through supplied modifiers    
            if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script 
                debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
                warning = True #if the input modifier is not found
            else: #if it is a recognized modifier...
                if letter == 'd': #if -d is supplied...
                    saveDebug = True #The debug log will be saved to the current working directory 
                if letter == 'h': #if -h is supplied...
                    printHead = False #header will not be included in the output  
                if letter == 'p': #if -p is supplied...
                    printOut = True #The results will be printed to the standard output (usually the screen)
                if letter == 'r': #if -r is supplied...
                    readable = True #columbs are spaced in a more readable way
    #########################################################################################################################

    ###Input data parseing and file procedures###############################################################################
    inHandle = open(inPath, 'r') #creates a file object
    inHandle.readline() #moves position past the header on the first line
    for line in inHandle: #loops through the lines of the input file
        lineParts = string.split(line) #produces a list of all the contents of each line (separated by spaces/abs)
        reference = lineParts[1]
        actual    = lineParts[6]
        density   = 0 
        for index in range(2,6): #adds the middle 4 values together to obtain density 
            density += int(lineParts[index])
        inData.append([reference,density,actual]) #adds parsed line to the list that will be analysed 
    inHandle.close() #closes the file object
    #outputs [[ref,density,actual],...]
    #########################################################################################################################

    ###Statistical analysis##################################################################################################
    contigNum = len(inData)/1000 #The number of contigs the base pile output represents 
    for count in range(0,contigNum): #do once for every contig 
        contig = inData[count * 1000:(count + 1) * 1000] #isolates the parsed input data specific to that contig
        refMatchLen = 0 #records the index of the last known base on the reference  
        tarMatchSum = 0 #counts how many known (not N) bases match the target
        medDensity  = 0 #median density value for the contig
        index  = 0
        denList = [] #a list of all the densities for this contig; used to get median and average density
        for bp in contig: #for every base in the contig...
            denList.append(bp[1]) #record its density value
            if bp[0] != 'N': #if the actual value is known...
                refMatchLen = index #the length of the viable portion of the reference that is known  
                if bp[2] != 'N' and bp[0] == bp[2]: #if the actual value is known and equals the reference... 
                    tarMatchSum += 1 #increment the amount of matching bases 
            index += 1 #incremented to match the index of the current base pair 
        denList.sort() #allows for the median calculation 
        coveragePro = float(tarMatchSum) / float(refMatchLen) #percentage of the actual that matchs to the reference 
        aveDensity  = float(sum(denList)) / float(refMatchLen) #average density values within the known portion of the reference 
        medDensity  = float(denList[499] + denList[500]) / 2 #median density of the entire range of density values 
        contigStats = []
        contigStats.append(count + 1)       #GSS basepile #
        contigStats.append(refMatchLen)
        contigStats.append(tarMatchSum)
        contigStats.append(coveragePro)
        contigStats.append(aveDensity)
        contigStats.append(medDensity)
        parsedData.append(contigStats)
    #outputs: [[GSS basepile #,GSS ref match length,target match sum,coverage proportion,average density of target,median densitiy]...]
    #########################################################################################################################

    def round(dec): #basic rounding function 
        outDec = dec
        decPos = string.find(dec,'.')
        if decPos != -1 and len(dec) > (decPos + 3):
            outDec = outDec[:decPos + 3]
        return outDec

    ###Data-to-Text Formating################################################################################################
    for contig in parsedData: #for every contig
        contigText = ''
        for stat in contig: #for every statistic in each contig
            statText = round(str(stat)) #round to three significant figures 
            contigText += statText + '\t' #add the statistic to the output for this contig
            if readable: #if the -r modifier is supplied 
                contigText += '\t' #add an extra tab to make the columbs align to the header
        contigText = contigText[:-1] #removes last \t
        if readable: #if the -r modifier is supplied
                contigText = contigText[:-1] #remove the extra tab
        contigText += '\n' #add an endline after the statistics of each contig 
        outData.append(contigText) #add the contig to the output data 
    #########################################################################################################################

    ###Out file writing and saveing procedures###############################################################################
    savePath += os.path.basename(inPath)[:-4] + '_stats.txt' #the path to where the output is saved
    outHandle = open(savePath, 'w') #opens the file object for saving the output
    for line in outData: #prints every line of outData to the output file...
        outHandle.write(line)
    outHandle.close()
    if printOut: #if the -p modifier is supplied
        for line in outData: #prints every line to the standard output...
            print line[:-1]
    #########################################################################################################################