Qualtofa.py: Difference between revisions
From OpenWetWare
Jump to navigationJump to search
mNo edit summary |
mNo edit summary |
||
Line 1: | Line 1: | ||
Copy and past the following into a text editor and save it as sumqual.py to use the script. | |||
<pre> | <pre> | ||
#!/usr/bin/env python | #!/usr/bin/env python |
Revision as of 03:13, 13 January 2010
Copy and past the following into a text editor and save it as sumqual.py to use the script.
#!/usr/bin/env python ############################################################################################################################# #qualtofa [options] <quality file path> # #Example: python qualtofa.py -n 20 ../myValues.qual # #Version: 1.3 Last Modified: 12/30/09 Author: Zachary S. L. Foster # #---------------------------------------------------------------------------------------------------------------------------# #>Argument: path to a yasra quality value file (.qual) containing values for one or more sequences # #---------------------------------------------------------------------------------------------------------------------------# #>Selectively extracts the sequence from a consensus quality file and outputs it to a fasta file (.fa) # #>Conflicting bases are condensed into IUPAC ambiguity codes # #---------------------------------------------------------------------------------------------------------------------------# #Modifiers: # # -d : save debug log; specify where to save the debug log (default = current working directory) # # -e : save log of edits made to the contig (default = print to screen) # # -p : print output # # -n : replace all bases that have a total quality value score less than the specified amount # ############################################################################################################################# ###Imports / Variable Initilization########################################################################################## import os, string, sys, time defArgList = ['qualtofa.py','-n','30','C:/Python26/balf.mummeraligned.qual'] #argument list used during script testing the with Python GUI IDLE argList = sys.argv #argument list supplied by user argNum = len(argList) #the number of arguments supplied minArgNum = 1 #the smallest amount of arguments with which it is possible to run the script saveDebug = False #is True if the debug is to be saved warning = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug log printOut = False #is True when no the -p modifier is supplied; #The results will be printed to the standard output (usually the screen) helpOnly = False #is 'True' when no arguments are given; only help menu is printed maskingChar = 'N' #What represents a "unknown" base minQualScore= 0 #determined by the presence of the -n modifier savePath = './' #the directory in which the output will be saved defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE modifiers = [] #eventually contains a list of all modifiers and their arguments supplied by the user allMods = 'depn' #all the modifiers recognized by the script activeMods = '' #a string containing all modifiers specified by the user modArgs = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers qualData = [] outData = [] #eventually holds the data that will be saved in the output file debugLog = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied ############################################################################################################################# #Error handling function##################################################################################################### #>Is called when the script encounters a fatal error # #>Prints the debug log to standard out (usually the screen) # def errorExit(): print 'The program was forced to exit prematurely, printing debug log...\n' for line in debugLog: print line sys.exit() ############################################################################################################################# def printHelp(): print '/---------------------------------------------------------------------------------------\\' print '| qualtofa [options] <yasra quality file path> |' print '|---------------------------------------------------------------------------------------|' print '| Example: python qualtofa.py -n 20 ../myValues.qual |' print '|---------------------------------------------------------------------------------------|' print '| Version: 1.3 Last Modified: 12/30/09 Author: Zachary S. L. Foster |' print '| >Selectively extracts the sequence from a consensus quality file and outputs it to a |' print '| fasta file (.fa) |' print '| >Conflicting bases are condensed into IUPAC ambiguity codes |' print '|---------------------------------------------------------------------------------------|' print '| Modifiers: |' print '| -d : Save debug log |' print '| -p : Print output to standard out (usually the shell being called from) |' print '| -n : mask all bases that have a total quality value score less than the given value |' print '\\---------------------------------------------------------------------------------------/' ############################################################################################################################# #>function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10) # def mkRange(numList): outList = [] start = numList[0] current = numList[0] numList.append(numList[-1]) for num in numList[1:]: if (num - 1) != current: outList.append([start,current]) start = num current = num return outList ############################################################################################################################# ###Argument Interprtation#################################################################################################### #>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE # #>If no arguments are given, the help menu is printed # #>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation) # if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE... if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied... if os.path.exists(argList[-1]) == 0: #if the path dose not exist debugLog.append('Error: Invalid file path to input data') errorExit() #end the program elif argNum == 1: #If no arguments are supplied... helpOnly = True printHelp() #prints help menu else: debugLog.append('Error: Too few arguments supplied\n') errorExit() else: #If the script is being imported on to IDLE argList = defArgList #use default arguments argNum = len(defArgList) if argNum == 1: #If no arguments are supplied... helpOnly = True printHelp() #prints help menu savePath = defSavePath #sets the save path to the default, specified in the variable initialization section debugLog.append('Alert: default debugging input arguments are being used\n') if helpOnly == False: #if arguments were supplied... qualPath = argList[-1] #the path to the qual file containing the input data if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments) modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments ############################################################################################################################# if helpOnly == False: ###Modifier Interpretation############################################################################################### #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation" # #>Given arguments are compared against a list of known arguments # #>Matches found change the appropriate variable for the desired effect of the modifier the script # if len(modifiers) > 0: #if modifiers are supplied for mod in modifiers: #loops through the list of modifiers and modifier arguments if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters activeMods += mod[1:] #sorts the modifiers into activeMods... else: modArgs.append(mod) #assumes everything else to be a modifier argument for letter in activeMods: #checks if the modifiers are recognized if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script debugLog.append('Warning: Unexpected modifier: ' + letter + '\n') warning = True #if the input modifier is not found else: if letter == 'd': #if -d is supplied... saveDebug = True #The debug log will be saved to the current working directory if letter == 'p': #if -p is supplied... printOut = True #The results will be printed to the standard output (usually the screen) if letter == 'n': #if -n is supplied... if len(modArgs) > 0: #if there is at least one non-processed modifier argument minQualScore = int(modArgs[0]) #The string specifying the desired lines to process from the input file del modArgs[0] #the original argument is deleted from the list else: #if the list of modifier arguments is empty... print 'Error: Modifier argument not supplied\n' sys.exit() #exit the script ######################################################################################################################### ###Quality Value File Parsing############################################################################################ qualHandle = open(qualPath, 'r') #opens the file containing the quality values qualRaw = qualHandle.readlines() #saves all of the file into qualRaw qualHandle.close() #closes the file object qualName = qualRaw[2][:-1] #the name of the first contig is the 3rd line minus the newline qualQuals = [] #will contain a list of quality values for each contig qualData = [] #the data structure holding the parsed data qualRaw.append('randomline') #place mark needed to process the last contig for line in qualRaw[3:]: #for every line past the first 4... splitLine = string.split(line) #make a list of all the parts of line separated by whitespace if len(splitLine) == 1 and splitLine[0].isdigit() == False: #if there is only one "word" (i.e. it is a name of a contig) qualData.append([qualName,qualQuals]) #add the previous contig to the parsed data qualName = splitLine[0] #new contig name qualQuals = [] else: #if the line is a set of quality values... qualQuals.append(splitLine) #append them to the list of quality values for this contig ######################################################################################################################### ###Low-Value quality masking############################################################################################# if minQualScore > 0: #if the -n modifier was supplied... for cntgIndex in range(0,len(qualData)): #for every contig... for baseIndex in range(0,len(qualData[cntgIndex][1])): #for every base in every contig... baseDepth = (len(qualData[cntgIndex][1][baseIndex]) - 1) / 7 #the number of distinct bases per base number for depth in range(1,baseDepth + 1): #for every quality value for every base in every contig qualIndex = (depth * 7) #the index of the quality value in question charIndex = qualIndex - 6 #the index of the character value for that quality value if int(qualData[cntgIndex][1][baseIndex][qualIndex]) < minQualScore: #if the quality value is less then specified amount qualData[cntgIndex][1][baseIndex][charIndex] = maskingChar #mask that quality value character ######################################################################################################################### ###Raw Sequence Extraction############################################################################################### rawSeq = [] #will hold a list of sequences derived from the quality value data (ex: [[[A,C,N],[N],[A],[A,C]],...]) for cntgIndex in range(0,len(qualData)): #for every contig... cntgSeq = [] for baseIndex in range(0,len(qualData[cntgIndex][1])): #for every base of every contig baseValues = [] #will contain a list of all the character values for that base index on that contig baseDepth = (len(qualData[cntgIndex][1][baseIndex]) - 1) / 7 #the number of distinct bases (quality value data) per index for depth in range(1,baseDepth + 1): #for every quality value for every base in every contig charIndex = (depth * 7) - 6 #the index of the character value for that quality value baseValues.append(qualData[cntgIndex][1][baseIndex][charIndex]) #add the character value to the list for that index cntgSeq.append(baseValues) #add the list of characters to the list for the contig rawSeq.append(cntgSeq) #add the contig sequence ######################################################################################################################### ######################################################################################################################### #>Funtion used by the IUPAC function to remove repeated characters in list # #>ex: ['A','C','C'] becomes ['A','C'] # def removeRepeats(charList): for charIndex in range(0,len(charList)): for index in range(0,len(charList)): if charIndex != index and charList[charIndex] == charList[index]: del charList[index] return removeRepeats(charList) return charList ######################################################################################################################### ######################################################################################################################### #>Function that takes a list of base character values (ACTG-~N) and condenses them into a single IUPAC code # #>Assumes only input characters to be ACTG-~N # #>Ex: ['A','C'] becomes 'M' # def IUPAC(baseValues): x = removeRepeats(baseValues) length = len(x) if length == 0: return '-' elif length == 1: return x[0] for index in range(-len(baseValues) + 1, 1): index *= -1 if x[index] == '-' or x[index] == '~' or x[index] == 'N': del x[index] length = len(x) if length == 1: return x[0] elif length == 2: if (x[0] == 'C' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'C'): return 'Y' elif (x[0] == 'A' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'A'): return 'R' elif (x[0] == 'A' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'A'): return 'W' elif (x[0] == 'C' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'C'): return 'S' elif (x[0] == 'G' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'G'): return 'K' elif (x[0] == 'C' and x[1] == 'A') or (x[0] == 'A' and x[1] == 'C'): return 'M' elif length == 3: if x[0] != 'C' and x[1] != 'C' and x[2] != 'C': return 'D' elif x[0] != 'T' and x[1] != 'T' and x[2] != 'T': return 'V' elif x[0] != 'G' and x[1] != 'G' and x[2] != 'G': return 'H' elif x[0] != 'A' and x[1] != 'A' and x[2] != 'A': return 'B' else: return 'N' ######################################################################################################################### ###Output Sequence generation############################################################################################ #>Condenses any parts of overlapping contigs into IUPAC codes # outSeqs = [] #a list of contig names and sequences [[name,seq],[...],...] index = 0 #will correspond with the relevent index of rawSeq for seq in rawSeq: #for every sequence... outSeq = '' for charList in seq: #for every chararcter list for each base outSeq += IUPAC(charList) #condense the list into a sigle IUPAC code outSeqs.append([qualData[index][0],outSeq]) #save name and sequence together index += 1 ######################################################################################################################### ###Out file writing and saveing procedures############################################################################### fileSavePath = savePath + os.path.basename(qualPath) + '.fsa' #the path to where the output is saved outHandle = open(fileSavePath, 'w') #opens the file object for saving the output for seq in outSeqs: #prints every line of outSeqs to the output file... outHandle.write('>' + seq[0] + '\n') outHandle.write(seq[1] + '\n') if printOut: #if the -p modifier is supplied... for seq in outSeqs: #prints every line of outSeqs to the screen... print seq[0] print seq[1] outHandle.close() #closes file object ######################################################################################################################### ###Debug Saving procedures############################################################################################### if saveDebug: #if the -d modifier is supplied... debugHandle = open(savePath + os.path.basename(qualPath) + '_debug.txt', 'w') for line in debugLog: debugHandle.write(line) debugHandle.close() #########################################################################################################################