Fatomum.py: Difference between revisions
From OpenWetWare
Jump to navigationJump to search
mNo edit summary |
mNo edit summary |
||
Line 1: | Line 1: | ||
Copy and past the following into a text editor and save it as | Copy and past the following into a text editor and save it as fatomum.py to use the script. | ||
<pre> | <pre> | ||
#!/usr/bin/env python | #!/usr/bin/env python |
Latest revision as of 16:50, 19 January 2010
Copy and past the following into a text editor and save it as fatomum.py to use the script.
#!/usr/bin/env python ############################################################################################################################# #fatomum.py [options] <fasta file containing alignment> # #Example: python fatomum.py -p ../myAlignment.fsa # #Version: 1.0 Last Modified: 1/15/2010 Author: Zachary S. L. Foster # #---------------------------------------------------------------------------------------------------------------------------# #>Argument: Path to a FASTA file containing a reference and its aligned contigs # #---------------------------------------------------------------------------------------------------------------------------# #>Creates a MUMmer-style output from a FASTA file of aligned sequences # #>The first line is always considered the reference and is not included in the output # #>The start index in terms of the reference, and in terms of the contig, as well as the length are recorded for each # # matching section of the contig # #---------------------------------------------------------------------------------------------------------------------------# #Modifiers: # # -d : save debug log; specify where to save the debug log (default = current working directory) # # -p : print output # # -m : set minimum match length; anything less will not be included in the output; default = 10 # ############################################################################################################################# ###Imports / Variable Initilization########################################################################################## import os, string, sys, time defArgList = ['fatomum.py','-p','-m','300','C:/Python26/balf.yasrahigh.koraref.CCassembly.fasta'] #argument list used during script testing the with Python GUI IDLE argList = sys.argv #argument list supplied by user argNum = len(argList) #the number of arguments supplied minArgNum = 1 #the smallest amount of arguments with which it is possible to run the script saveDebug = False #is True if the debug is to be saved warning = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug log printOut = False #is True when no the -p modifier is supplied; #The results will be printed to the standard output (usually the screen) helpOnly = False #is 'True' when no arguments are given; only help menu is printed savePath = './' #the directory in which the output will be saved defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE modifiers = [] #eventually contains a list of all modifiers and their arguments supplied by the user allMods = 'dpm' #all the modifiers recognized by the script activeMods = '' #a string containing all modifiers specified by the user modArgs = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers faData = [] #will contain a list of each contig and its name outData = [] #eventually holds the data that will be saved in the output file debugLog = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied minMatchLen = 20 #minimum match length; anything less will not be included in the output ############################################################################################################################# #Error handling function##################################################################################################### #>Is called when the script encounters a fatal error # #>Prints the debug log to standard out (usually the screen) # def errorExit(): print 'The program was forced to exit prematurely, printing debug log...\n' for line in debugLog: print line sys.exit() ############################################################################################################################# def printHelp(): print '/---------------------------------------------------------------------------------------\\' print '| fatomum.py [options] <fasta file containing alignment> |' print '|---------------------------------------------------------------------------------------|' print '| Example: python fatomum.py -p ../myAlignment.fsa |' print '|---------------------------------------------------------------------------------------|' print '| Version: 1.0 Last Modified: 1/15/2010 Author: Zachary S. L. Foster |' print '| >Creates a MUMmer-style output from a FASTA file of aligned sequences |' print '| >The first line is always considered the reference and is not included in the output |' print '|---------------------------------------------------------------------------------------|' print '| Modifiers: |' print '| -d : Save debug log |' print '| -p : Print output to standard out (usually the shell being called from) |' print '| -m : set minimum match length; anything less will not be considered a match; |' print '| default = 10 |' print '\\---------------------------------------------------------------------------------------/' ###Argument Interprtation#################################################################################################### #>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE # #>If no arguments are given, the help menu is printed # #>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation) # if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE... if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied... if os.path.exists(argList[-1]) == 0: #if the path dose not exist debugLog.append('Error: Invalid file path to input data') errorExit() #end the program elif argNum == 1: #If no arguments are supplied... helpOnly = True printHelp() #prints help menu else: debugLog.append('Error: Too few arguments supplied\n') errorExit() else: #If the script is being imported on to IDLE argList = defArgList #use default arguments argNum = len(defArgList) if argNum == 1: #If no arguments are supplied... helpOnly = True printHelp() #prints help menu savePath = defSavePath #sets the save path to the default, specified in the variable initialization section debugLog.append('Alert: default debugging input arguments are being used\n') if helpOnly == False: #if arguments were supplied... faPath = argList[-1] #the path to the FASTA file containing the input data if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments) modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments ############################################################################################################################# if helpOnly == False: ###Modifier Interpretation############################################################################################### #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation" # #>Given arguments are compared against a list of known arguments # #>Matches found change the appropriate variable for the desired effect of the modifier the script # if len(modifiers) > 0: #if modifiers are supplied for mod in modifiers: #loops through the list of modifiers and modifier arguments if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters activeMods += mod[1:] #sorts the modifiers into activeMods... else: modArgs.append(mod) #assumes everything else to be a modifier argument for letter in activeMods: #checks if the modifiers are recognized if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script debugLog.append('Warning: Unexpected modifier: ' + letter + '\n') warning = True #if the input modifier is not found else: if letter == 'd': #if -d is supplied... saveDebug = True #The debug log will be saved to the current working directory if letter == 'p': #if -p is supplied... printOut = True #The results will be printed to the standard output (usually the screen) if letter == 'm': #if -n is supplied... if len(modArgs) > 0: #if there is at least one non-processed modifier argument minMatchLen = int(modArgs[0]) #The string specifying the desired lines to process from the input file del modArgs[0] #the original argument is deleted from the list else: #if the list of modifier arguments is empty... print 'Error: Modifier argument not supplied\n' sys.exit() #exit the script ######################################################################################################################### ###Input data file procedures and parsing################################################################################ faHandle = open(faPath, 'r') #opens the input data file faRaw = faHandle.readlines() #all information is transferred to faRaw faHandle.close() #file object is closed faRaw.append('> ') #allows the loop to process the last contig cntgSeq = '' for line in faRaw: if line[0] == '>': #if the line is the name of a contig... if len(cntgSeq) > 0: #if there is sequence saved from the previous contig faData.append([cntgName,cntgSeq]) #add the previous contig to the list cntgSeq = '' #clear the old sequence cntgName = line[:-1] #new contig name is saved, [:-1] removes '\n' else: #if the line is a sequence... if line[0] == ' ': # if there is a space at the start of the line... line = line[1:] #remove the space cntgSeq += line[:-1] #add the sequence to any previous sequence encountered reference = faData[0] #the first sequence is assumed to be the reference ######################################################################################################################### ###MUMmer output emulation############################################################################################### outData = [] for contig in faData[1:]: #for every sequence except the reference... cntgMatches = [] #a list of match data for each contig name = contig[0] #name of the contig index = 0 #reference base index inMatch = False #if True, then the current base index contains a match refStart = 0 #the reference index of the first base of a match cntgStart = 0 #the contig index of the first base of a matches length = 0 #the length of the match firstSeqIndex = 0 #the first index of the contig that is not a '-' while contig[1][firstSeqIndex] == '-': firstSeqIndex += 1 for bp in contig[1]: #for every base pair... if inMatch == True: if ( bp == '-' or bp != reference[1][index] ): #if the current base is '-' or does not match the reference, while in a match, inMatch = False #therefore, not in a match length = index - refStart #calculate the length of the match, index is the base after the last base of the match if length >= minMatchLen: #if the match is long enough to be included in the output... cntgMatches.append([refStart,cntgStart,length]) #include it elif bp != '-' and index < len(reference[1]) and bp == reference[1][index]: #if not in a match, but the currrent base is not '-' and matches the reference.. inMatch = True #in a new match refStart = index #record start cntgStart = refStart - firstSeqIndex #start in terms of the contig index +=1 outData.append(name) #adds the name to the output, regardless of whether the contig has any matchs or not for match in cntgMatches: #if every match on this contig... outData.append(' ' + str(match[0]) + '\t' + str(match[1]) + '\t' + str(match[2])) #record a string version for the output ######################################################################################################################### ###Out file writing and saving procedures############################################################################### fileSavePath = savePath + os.path.basename(faPath) + '.fsa' #the path to where the output is saved outHandle = open(fileSavePath, 'w') #opens the file object for saving the output for line in outData: #prints every line of outSeqs to the output file... outHandle.write(line) if printOut: #if the -p modifier is supplied... for line in outData: print line outHandle.close() #closes file object ######################################################################################################################### ###Debug Saving procedures############################################################################################### if saveDebug: #if the -d modifier is supplied... debugHandle = open(savePath + os.path.basename(qualPath) + '_debug.txt', 'w') for line in debugLog: debugHandle.write(line) debugHandle.close() #########################################################################################################################