Qualtofa.py: Difference between revisions

From OpenWetWare
Jump to navigationJump to search
mNo edit summary
mNo edit summary
Line 1: Line 1:
Copy and past the following into a text editor and save it as sumqual.py to use the script.
<pre>
<pre>
#!/usr/bin/env python
#!/usr/bin/env python

Revision as of 03:13, 13 January 2010

Copy and past the following into a text editor and save it as sumqual.py to use the script.

#!/usr/bin/env python
 
#############################################################################################################################
#qualtofa [options] <quality file path>                                                                                     #
#Example: python qualtofa.py -n 20 ../myValues.qual                                                                         #
#Version: 1.3   Last Modified: 12/30/09     Author: Zachary S. L. Foster                                                    #
#---------------------------------------------------------------------------------------------------------------------------#
#>Argument: path to a yasra quality value file (.qual) containing values for one or more sequences                          # 
#---------------------------------------------------------------------------------------------------------------------------#    
#>Selectively extracts the sequence from a consensus quality file and outputs it to a fasta file (.fa)                      #
#>Conflicting bases are condensed into IUPAC ambiguity codes                                                                #
#---------------------------------------------------------------------------------------------------------------------------#
#Modifiers:                                                                                                                 #
#   -d      : save debug log; specify where to save the debug log       (default = current working directory)               #
#   -e      : save log of edits made to the contig                      (default = print to screen)                         #
#   -p      : print output                                                                                                  #
#   -n      : replace all bases that have a total quality value score less than the specified amount                        #
############################################################################################################################# 


###Imports / Variable Initilization##########################################################################################
import os, string, sys, time
defArgList  = ['qualtofa.py','-n','30','C:/Python26/balf.mummeraligned.qual'] #argument list used during script testing the with Python GUI IDLE
argList     = sys.argv #argument list supplied by user
argNum      = len(argList) #the number of arguments supplied
minArgNum   = 1 #the smallest amount of arguments with which it is possible to run the script
saveDebug   = False #is True if the debug is to be saved
warning     = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug log
printOut    = False #is True when no the -p modifier is supplied; #The results will be printed to the standard output (usually the screen)
helpOnly    = False #is 'True' when no arguments are given; only help menu is printed
maskingChar = 'N' #What represents a "unknown" base
minQualScore= 0 #determined by the presence of the -n modifier
savePath    = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE
modifiers   = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods     = 'depn' #all the modifiers recognized by the script 
activeMods  = '' #a string containing all modifiers specified by the user 
modArgs     = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
qualData    = []
outData     = [] #eventually holds the data that will be saved in the output file 
debugLog    = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied
#############################################################################################################################

#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error                                                                        #
#>Prints the debug log to standard out (usually the screen)                                                                 #
def errorExit():
    print 'The program was forced to exit prematurely, printing debug log...\n'
    for line in debugLog:
        print line
    sys.exit()
#############################################################################################################################

def printHelp():
    print '/---------------------------------------------------------------------------------------\\'
    print '| qualtofa [options] <yasra quality file path>                                          |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Example: python qualtofa.py -n 20 ../myValues.qual                                    |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Version: 1.3   Last Modified: 12/30/09     Author: Zachary S. L. Foster               |'
    print '|  >Selectively extracts the sequence from a consensus quality file and outputs it to a |'
    print '|    fasta file (.fa)                                                                   |'
    print '|  >Conflicting bases are condensed into IUPAC ambiguity codes                          |'
    print '|---------------------------------------------------------------------------------------|'                       
    print '| Modifiers:                                                                            |'
    print '|  -d  : Save debug log                                                                 |'
    print '|  -p  : Print output to standard out (usually the shell being called from)             |'
    print '|  -n  : mask all bases that have a total quality value score less than the given value |'
    print '\\---------------------------------------------------------------------------------------/'

#############################################################################################################################
#>function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)               #
def mkRange(numList):
    outList = []
    start   = numList[0]
    current = numList[0]
    numList.append(numList[-1])
    for num in numList[1:]:
        if (num - 1) != current:
            outList.append([start,current])
            start = num
        current = num
    return outList
#############################################################################################################################

###Argument Interprtation####################################################################################################
#>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE                       #
#>If no arguments are given, the help menu is printed                                                                       #
#>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation)           #
if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
    if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
        if os.path.exists(argList[-1]) == 0: #if the path dose not exist
            debugLog.append('Error: Invalid file path to input data')
            errorExit() #end the program 
    elif argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu 
    else:
        debugLog.append('Error: Too few arguments supplied\n')
        errorExit() 
else: #If the script is being imported on to IDLE 
    argList  = defArgList #use default arguments 
    argNum   = len(defArgList)
    if argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu
    savePath = defSavePath #sets the save path to the default, specified in the variable initialization section 
    debugLog.append('Alert: default debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
    qualPath = argList[-1] #the path to the qual file containing the input data 
    if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
        modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments
#############################################################################################################################
        
if helpOnly == False:

    ###Modifier Interpretation###############################################################################################
    #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"     #
    #>Given arguments are compared against a list of known arguments                                                        #
    #>Matches found change the appropriate variable for the desired effect of the modifier the script                       #
    if len(modifiers) > 0: #if modifiers are supplied
        for mod in modifiers: #loops through the list of modifiers and modifier arguments
            if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters         
                activeMods += mod[1:] #sorts the modifiers into activeMods...
            else:  
                modArgs.append(mod) #assumes everything else to be a modifier argument
        for letter in activeMods: #checks if the modifiers are recognized  
            if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script
                debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
                warning = True #if the input modifier is not found
            else:
                if letter == 'd': #if -d is supplied...
                    saveDebug = True #The debug log will be saved to the current working directory 
                if letter == 'p': #if -p is supplied...
                    printOut = True #The results will be printed to the standard output (usually the screen)
                if letter == 'n': #if -n is supplied...
                    if len(modArgs) > 0: #if there is at least one non-processed modifier argument 
                        minQualScore = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
    #########################################################################################################################

    ###Quality Value File Parsing############################################################################################
    qualHandle = open(qualPath, 'r') #opens the file containing the quality values
    qualRaw    = qualHandle.readlines() #saves all of the file into qualRaw
    qualHandle.close() #closes the file object
    qualName = qualRaw[2][:-1] #the name of the first contig is the 3rd line minus the newline
    qualQuals = [] #will contain a list of quality values for each contig 
    qualData = [] #the data structure holding the parsed data 
    qualRaw.append('randomline') #place mark needed to process the last contig
    for line in qualRaw[3:]: #for every line past the first 4...
        splitLine = string.split(line) #make a list of all the parts of line separated by whitespace
        if len(splitLine) == 1 and splitLine[0].isdigit() == False: #if there is only one "word" (i.e. it is a name of a contig)
            qualData.append([qualName,qualQuals]) #add the previous contig to the parsed data 
            qualName = splitLine[0] #new contig name 
            qualQuals = [] 
        else: #if the line is a set of quality values...
            qualQuals.append(splitLine) #append them to the list of quality values for this contig
    #########################################################################################################################

    ###Low-Value quality masking#############################################################################################
    if minQualScore > 0: #if the -n modifier was supplied...
        for cntgIndex in range(0,len(qualData)): #for every contig...
            for baseIndex in range(0,len(qualData[cntgIndex][1])): #for every base in every contig...
                baseDepth = (len(qualData[cntgIndex][1][baseIndex]) - 1) / 7 #the number of distinct bases per base number
                for depth in range(1,baseDepth + 1): #for every quality value for every base in every contig
                    qualIndex = (depth * 7) #the index of the quality value in question 
                    charIndex = qualIndex - 6 #the index of the character value for that quality value
                    if int(qualData[cntgIndex][1][baseIndex][qualIndex]) < minQualScore: #if the quality value is less then specified amount 
                        qualData[cntgIndex][1][baseIndex][charIndex] = maskingChar #mask that quality value character
    #########################################################################################################################

    ###Raw Sequence Extraction###############################################################################################
    rawSeq = [] #will hold a list of sequences derived from the quality value data (ex: [[[A,C,N],[N],[A],[A,C]],...]) 
    for cntgIndex in range(0,len(qualData)): #for every contig...
        cntgSeq = []
        for baseIndex in range(0,len(qualData[cntgIndex][1])): #for every base of every contig
            baseValues = [] #will contain a list of all the character values for that base index on that contig 
            baseDepth = (len(qualData[cntgIndex][1][baseIndex]) - 1) / 7 #the number of distinct bases (quality value data) per index
            for depth in range(1,baseDepth + 1): #for every quality value for every base in every contig
                charIndex = (depth * 7) - 6 #the index of the character value for that quality value
                baseValues.append(qualData[cntgIndex][1][baseIndex][charIndex]) #add the character value to the list for that index
            cntgSeq.append(baseValues) #add the list of characters to the list for the contig 
        rawSeq.append(cntgSeq) #add the contig sequence
    #########################################################################################################################

    #########################################################################################################################
    #>Funtion used by the IUPAC function to remove repeated characters in list                                              #
    #>ex: ['A','C','C'] becomes ['A','C']                                                                                   #
    def removeRepeats(charList):
        for charIndex in range(0,len(charList)):
            for index in range(0,len(charList)):
                if charIndex != index and charList[charIndex] == charList[index]:
                    del charList[index]
                    return removeRepeats(charList)
        return charList
    #########################################################################################################################

    #########################################################################################################################
    #>Function that takes a list of base character values (ACTG-~N) and condenses them into a single IUPAC code             #
    #>Assumes only input characters to be ACTG-~N                                                                           #
    #>Ex: ['A','C'] becomes 'M'                                                                                             #
    def IUPAC(baseValues):      
        x = removeRepeats(baseValues)
        length = len(x)
        if length == 0:
            return '-'
        elif length == 1:
            return x[0]
        for index in range(-len(baseValues) + 1, 1):
            index *= -1
            if x[index] == '-' or x[index] == '~' or x[index] == 'N':
                del x[index]
        length = len(x)
        if length == 1:
            return x[0]
        elif length == 2:
            if (x[0] == 'C' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'C'):
                return 'Y'
            elif (x[0] == 'A' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'A'):
                return 'R'
            elif (x[0] == 'A' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'A'):
                return 'W'
            elif (x[0] == 'C' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'C'):
                return 'S'
            elif (x[0] == 'G' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'G'):
                return 'K'
            elif (x[0] == 'C' and x[1] == 'A') or (x[0] == 'A' and x[1] == 'C'):
                return 'M'
        elif length == 3:
            if x[0] != 'C' and x[1] != 'C' and x[2] != 'C':
                return 'D'
            elif x[0] != 'T' and x[1] != 'T' and x[2] != 'T':
                return 'V'
            elif x[0] != 'G' and x[1] != 'G' and x[2] != 'G':
                return 'H'
            elif x[0] != 'A' and x[1] != 'A' and x[2] != 'A':
                return 'B'
        else:
            return 'N'
    #########################################################################################################################

    ###Output Sequence generation############################################################################################
    #>Condenses any parts of overlapping contigs into IUPAC codes                                                           #
    outSeqs = [] #a list of contig names and sequences [[name,seq],[...],...]
    index = 0 #will correspond with the relevent index of rawSeq
    for seq in rawSeq: #for every sequence...
        outSeq = ''
        for charList in seq: #for every chararcter list for each base
            outSeq += IUPAC(charList) #condense the list into a sigle IUPAC code 
        outSeqs.append([qualData[index][0],outSeq]) #save name and sequence together 
        index += 1
    #########################################################################################################################

    ###Out file writing and saveing procedures###############################################################################
    fileSavePath = savePath + os.path.basename(qualPath) + '.fsa' #the path to where the output is saved
    outHandle = open(fileSavePath, 'w') #opens the file object for saving the output
    for seq in outSeqs: #prints every line of outSeqs to the output file...
        outHandle.write('>' + seq[0] + '\n')
        outHandle.write(seq[1] + '\n')
    if printOut: #if the -p modifier is supplied...
        for seq in outSeqs: #prints every line of outSeqs to the screen...
            print seq[0] 
            print seq[1]
    outHandle.close() #closes file object
    #########################################################################################################################

    ###Debug Saving procedures###############################################################################################
    if saveDebug: #if the -d modifier is supplied...
        debugHandle = open(savePath + os.path.basename(qualPath) + '_debug.txt', 'w')
        for line in debugLog:
            debugHandle.write(line)
        debugHandle.close()
    #########################################################################################################################