Gapstrip.py: Difference between revisions
From OpenWetWare
Jump to navigationJump to search
mNo edit summary |
No edit summary |
||
Line 1: | Line 1: | ||
Copy and past the following into a text editor and save it as | Copy and past the following into a text editor and save it as gapstrip.py to use the script. | ||
<pre> | <pre> | ||
#!/usr/bin/env python | #!/usr/bin/env python |
Latest revision as of 17:46, 14 January 2010
Copy and past the following into a text editor and save it as gapstrip.py to use the script.
#!/usr/bin/env python ############################################################################################################################# #gapstrip [options] <fasta file> # #Example: python ../gapstrip -n 4 -p ../myfile.fsa # #Version: 1.0 Last Modified: 12/30/09 Author: Zachary S. L. Foster # #---------------------------------------------------------------------------------------------------------------------------# #Argument: A fasta file containing sequences aligned to a reference or each other. The first sequence is considered to the # # reference by default # #---------------------------------------------------------------------------------------------------------------------------# #>Searches for base indices at which all of the aligned sequences have a '-' # #>Ignores differences caused by the reference (a.k.a. first sequence in the input file by default), or the number of lines # # specified by the -n modifier, if used. # #>Removes the data for every sequence at each indice where all bases were '-', effectively removing each gap. # #---------------------------------------------------------------------------------------------------------------------------# #Modifiers: # # -n : specify number of top lines to ignore in gap strip (default = 1) # # -d : save debug log to current working directory # # -p : print output # ############################################################################################################################# ###Imports / Variable Initilization########################################################################################## import os, string, sys defArgList = ['gapstrip.py','-p','-n','2','C:/Python26/gapStripDef.fsa'] #argument list used during script testing the with Python GUI IDLE argList = sys.argv #argument list supplied by user argNum = len(argList) #the number of arguments supplied minArgNum = 1 #the smallest amount of arguments with which it is possible to run the script saveDebug = False #is True if the debug is to be saved warning = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug law printOut = False #is Ture if the -p modifier is supplied; the output will be printed to standard out (usually the screen) helpOnly = False #is 'True' when no arguments are given; only help menu is printed debugLog = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied savePath = './' #the directory in which the output will be saved defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE modifiers = [] #eventually contains a list of all modifiers and their arguments supplied by the user allMods = 'ndp' #all the modifiers recognized by the script activeMods = '' #a string containing all modifiers specified by the user modArgs = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers faData = [] #will hold all data from faRaw in the following format [[name of contig, sequence],[..]..] outData = [] editLog = [] #a list of modifications made to the data in order to align it to the reference topNum = 1 #indicates the number of sequences to ignore; can be changed with the -n modifier ############################################################################################################################# def printHelp(): print '/---------------------------------------------------------------------------------------\\' print '| gapstrip [options] <fasta file> |' print '|---------------------------------------------------------------------------------------|' print '| Example: python ../gapstrip -n 4 -p ../myfile.fsa |' print '|---------------------------------------------------------------------------------------|' print '| Version: 1.0 Last Modified: 12/30/09 Author: Zachary S. L. Foster |' print '| >Searches for base indices at which all of the aligned sequences have a - |' print '| >Ignores differences caused by the reference (a.k.a. first sequence in the input file|' print '| by default), or the number of lines specified by the -n modifier, if used. |' print '| >Removes the data for every sequence at each index where all bases were -, |' print '| effectively removing each gap. |' print '|---------------------------------------------------------------------------------------|' print '| Modifiers: |' print '| -d : Save debug log |' print '| -p : Print output to standard out (usually the shell being called from) |' print '| -n : specify number of top lines to ignore in gap strip (default = 1) |' print '\\---------------------------------------------------------------------------------------/' #Error handling function##################################################################################################### #>Is called when the script encounters a fatal error # #>Prints the debug log to standard out (usually the screen) def errorExit(): print 'The program was forced to exit prematurely, printing debug log...\n' for line in debugLog: print line sys.exit() ############################################################################################################################# ############################################################################################################################# #>function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10) # def mkRange(numList): outList = [] start = numList[0] current = numList[0] numList.append(numList[-1]) for num in numList[1:]: if (num - 1) != current: outList.append([start,current]) start = num current = num return outList ############################################################################################################################# ###Argument Interprtation#################################################################################################### if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE... if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied... if os.path.exists(argList[-1]) == 0: #if the path dose not exist debugLog.append('Error: Invalid file path to fasta file') errorExit() #end the program elif argNum == 1: #If no arguments are supplied... helpOnly = True printHelp() #prints help menu else: debugLog.append('Error: Too few arguments supplied\n') errorExit() #end the program else: #If the script is being imported on to IDLE argList = defArgList #use default arguments argNum = len(defArgList) if argNum == 1: #If no arguments are supplied... helpOnly = True printHelp() #prints help menu savePath = defSavePath #sets the save path to the default, specified in the variable initialization section debugLog.append('Alert: default debugging input arguments are being used\n') if helpOnly == False: #if arguments were supplied... faPath = argList[-1] #the path to the fasta file containing the input data if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments) modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their argum ############################################################################################################################# if helpOnly == False: ###Modifier Interpretation############################################################################################### #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation" # #>Given arguments are compared against a list of known arguments # #>Matches found change the appropriate variable for the desired effect of the modifier the script # if len(modifiers) > 0: #if modifiers are supplied for mod in modifiers: #loops through the list of modifiers and modifier arguments if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters activeMods += mod[1:] #sorts the modifiers into activeMods... else: modArgs.append(mod) #assumes everything else to be a modifier argument for letter in activeMods: #checks if the modifiers are recognized if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script debugLog.append('Warning: Unexpected modifier: ' + letter + '\n') warning = True #if the input modifier is not found else: if letter == 'd': #if -d is supplied... saveDebug = True #The debug log will be saved to the current working directory if letter == 'p': #if -p is supplied... printOut = True #The results will be printed to the standard output (usually the screen) if letter == 'n': #if -n is supplied... if len(modArgs) > 0: #if there is at least one non-processed modifier argument topNum = int(modArgs[0]) #The string specifying the desired lines to process from the input file del modArgs[0] #the original argument is deleted from the list else: #if the list of modifier arguments is empty... print 'Error: Modifier argument not supplied\n' sys.exit() #exit the script ######################################################################################################################### ###Query fasta input data parsing and file procedures#################################################################### #>extracts the input data from the files at the supplied path # #>Parses the data into a list in the following format [[name of contig, sequence],[..]..] # faHandle = open(faPath, 'r') #creates a file object faRaw = faHandle.readlines() #all of the content of the input file in its raw form faHandle.close() #closes the file object faRaw.append('> ') #makes it so the last sequence can be processed in the following loop cntgSeq = '' cntgName = '' for line in faRaw: #loops through the lines of the input file if line[0] == '>': #if the first character of the line is a '>' if len(cntgName) > 0 and len(cntgSeq) > 0: faData.append([cntgName,cntgSeq]) cntgSeq = '' cntgName = line[1:-1] #[1:-1] removes '>' and '\n' else: if line[0] == ' ': #if the line begins with a space... line = line[1:] #remove the space if line[-1] == '\n': #if the line ends with a newline... line = line[:-1] #remove the newline cntgSeq += line #add contents of the line to the total sequence for this contig ######################################################################################################################### ###Pointless spacer removal############################################################################################## delList = [] #will contain the indices at which all of the sequences have '-' and must be deleted refData = faData[0][1] #the reference is assumed to be the first sequence (even if the -n modifier is set to 0) for refIndex in range(0,len(refData)): #loops through each index in terms of the reference pointlessSpace = True #each index is assumed to be a space for cntgIndex in range(topNum,len(faData)): #loops through all the sequences that are being checked for spaces seqLen = len(faData[cntgIndex][1]) if refIndex < seqLen: #if the index being tested is present on this contig... bp = faData[cntgIndex][1][refIndex] if bp != '-': #if the base is not a space... pointlessSpace = False if pointlessSpace: delList.append(refIndex) if len(delList) > 0: delList = mkRange(delList) #makes a list of indices into a list of index ranges delList.reverse() #reverses the order of the list to preserve the accuracy of the index values in delList as they are deleted from faData for theRange in delList: #for every range specified by delList... for cntgIndex in range(0,len(faData)): #for every contig is faData... faData[cntgIndex][1] = faData[cntgIndex][1][:theRange[0]] + faData[cntgIndex][1][theRange[1] + 1:] #remove the section determined to be spaces else: print 'No gaps found...' ######################################################################################################################### ###Out file writing and saveing procedures############################################################################### fileSavePath = savePath + 'GapStrip.' + os.path.basename(faPath) #the path to where the output is saved outHandle = open(fileSavePath, 'w') #opens the file object for saving the output for line in faData: #prints every line of outData to the output file... outHandle.write('>' + line[0] + '\n') #write contig name outHandle.write(line[1] + '\n') #write contig sequence if printOut: #if the -p modifier is supplied for line in faData: #prints every line to the standard output print '>' + line[0] print line[1] outHandle.close() ######################################################################################################################### ###Debug Saving procedures############################################################################################### if saveDebug: #if the -d modifier is supplied fileSavePath = savePath + 'GSDebug.' + os.path.basename(faPath) #the path to where the debug log is to be saved debugHandle = open(fileSavePath, 'w') #opens the file object for saving the debug log for line in debugLog: debugHandle.write(line) debugHandle.close() #########################################################################################################################