Gapstrip.py

From OpenWetWare

Jump to: navigation, search

Copy and past the following into a text editor and save it as gapstrip.py to use the script.

#!/usr/bin/env python 

#############################################################################################################################
#gapstrip [options] <fasta file>                                                                                            #   
#Example: python ../gapstrip -n 4 -p ../myfile.fsa                                                                          #
#Version: 1.0   Last Modified: 12/30/09     Author: Zachary S. L. Foster                                                    #
#---------------------------------------------------------------------------------------------------------------------------#
#Argument: A fasta file containing sequences aligned to a reference or each other. The first sequence is considered to the  #
#   reference by default                                                                                                    #
#---------------------------------------------------------------------------------------------------------------------------#
#>Searches for base indices at which all of the aligned sequences have a '-'                                                #
#>Ignores differences caused by the reference (a.k.a. first sequence in the input file by default), or the number of lines  #
#   specified by the -n modifier, if used.                                                                                  #
#>Removes the data for every sequence at each indice where all bases were '-', effectively removing each gap.               #
#---------------------------------------------------------------------------------------------------------------------------#
#Modifiers:                                                                                                                 #
#   -n      : specify number of top lines to ignore in gap strip        (default = 1)                                       #
#   -d      : save debug log to current working directory                                                                   #
#   -p      : print output                                                                                                  #
#############################################################################################################################

###Imports / Variable Initilization##########################################################################################
import os, string, sys
defArgList  = ['gapstrip.py','-p','-n','2','C:/Python26/gapStripDef.fsa'] #argument list used during script testing the with Python GUI IDLE   
argList     = sys.argv #argument list supplied by user
argNum      = len(argList) #the number of arguments supplied
minArgNum   = 1 #the smallest amount of arguments with which it is possible to run the script
saveDebug   = False #is True if the debug is to be saved
warning     = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug law
printOut    = False #is Ture if the -p modifier is supplied; the output will be printed to standard out (usually the screen)
helpOnly    = False #is 'True' when no arguments are given; only help menu is printed
debugLog    = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied
savePath    = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE 
modifiers   = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods     = 'ndp' #all the modifiers recognized by the script 
activeMods  = '' #a string containing all modifiers specified by the user 
modArgs     = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
faData      = [] #will hold all data from faRaw in the following format [[name of contig, sequence],[..]..]
outData     = []
editLog     = [] #a list of modifications made to the data in order to align it to the reference 
topNum      = 1 #indicates the number of sequences to ignore; can be changed with the -n modifier 
#############################################################################################################################

def printHelp():
    print '/---------------------------------------------------------------------------------------\\'
    print '| gapstrip [options] <fasta file>                                                       |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Example: python ../gapstrip -n 4 -p ../myfile.fsa                                     |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Version: 1.0   Last Modified: 12/30/09     Author: Zachary S. L. Foster               |'
    print '|  >Searches for base indices at which all of the aligned sequences have a -            |'
    print '|  >Ignores differences caused by the reference (a.k.a. first sequence in the input file|'
    print '|    by default), or the number of lines specified by the -n modifier, if used.         |'
    print '|  >Removes the data for every sequence at each index where all bases were -,           |'
    print '|    effectively removing each gap.                                                     |'
    print '|---------------------------------------------------------------------------------------|'                       
    print '| Modifiers:                                                                            |'
    print '|  -d  : Save debug log                                                                 |'
    print '|  -p  : Print output to standard out (usually the shell being called from)             |'
    print '|  -n  : specify number of top lines to ignore in gap strip        (default = 1)        |'
    print '\\---------------------------------------------------------------------------------------/'

#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error                                                                        #
#>Prints the debug log to standard out (usually the screen)
def errorExit():
    print 'The program was forced to exit prematurely, printing debug log...\n'
    for line in debugLog:
        print line
    sys.exit()
#############################################################################################################################                                                           

#############################################################################################################################
#>function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)               #
def mkRange(numList):
    outList = []
    start   = numList[0]
    current = numList[0]
    numList.append(numList[-1])
    for num in numList[1:]:
        if (num - 1) != current:
            outList.append([start,current])
            start = num
        current = num
    return outList
#############################################################################################################################

###Argument Interprtation####################################################################################################
if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
    if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
        if os.path.exists(argList[-1]) == 0: #if the path dose not exist
            debugLog.append('Error: Invalid file path to fasta file')
            errorExit() #end the program
    elif argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu
    else: 
        debugLog.append('Error: Too few arguments supplied\n') 
        errorExit() #end the program
else: #If the script is being imported on to IDLE 
   argList  = defArgList #use default arguments
   argNum   = len(defArgList)
   if argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu
   savePath = defSavePath #sets the save path to the default, specified in the variable initialization section 
   debugLog.append('Alert: default debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
    faPath = argList[-1] #the path to the fasta file containing the input data 
    if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
        modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their argum
#############################################################################################################################


if helpOnly == False:
    ###Modifier Interpretation###############################################################################################
    #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"     #
    #>Given arguments are compared against a list of known arguments                                                        #
    #>Matches found change the appropriate variable for the desired effect of the modifier the script                       #
    if len(modifiers) > 0: #if modifiers are supplied
        for mod in modifiers: #loops through the list of modifiers and modifier arguments
            if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters         
                activeMods += mod[1:] #sorts the modifiers into activeMods...
            else:  
                modArgs.append(mod) #assumes everything else to be a modifier argument
        for letter in activeMods: #checks if the modifiers are recognized  
            if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script
                debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
                warning = True #if the input modifier is not found
            else:
                if letter == 'd': #if -d is supplied...
                    saveDebug = True #The debug log will be saved to the current working directory 
                if letter == 'p': #if -p is supplied...
                    printOut = True #The results will be printed to the standard output (usually the screen)
                if letter == 'n': #if -n is supplied...
                    if len(modArgs) > 0: #if there is at least one non-processed modifier argument 
                        topNum = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
    #########################################################################################################################

    ###Query fasta input data parsing and file procedures####################################################################
    #>extracts the input data from the files at the supplied path                                                           #
    #>Parses the data into a list in the following format [[name of contig, sequence],[..]..]                               #
    faHandle  = open(faPath, 'r') #creates a file object
    faRaw     = faHandle.readlines() #all of the content of the input file in its raw form
    faHandle.close() #closes the file object
    faRaw.append('> ') #makes it so the last sequence can be processed in the following loop
    cntgSeq = ''
    cntgName = ''
    for line in faRaw: #loops through the lines of the input file
        if line[0] == '>': #if the first character of the line is a '>'
            if len(cntgName) > 0 and len(cntgSeq) > 0:
                faData.append([cntgName,cntgSeq])
            cntgSeq = ''
            cntgName = line[1:-1]       #[1:-1] removes '>' and '\n'
        else:
            if line[0] == ' ': #if the line begins with a space...
                line = line[1:] #remove the space
            if line[-1] == '\n': #if the line ends with a newline...
                line = line[:-1] #remove the newline
            cntgSeq += line #add contents of the line to the total sequence for this contig
    #########################################################################################################################

    ###Pointless spacer removal##############################################################################################
    delList = [] #will contain the indices at which all of the sequences have '-' and must be deleted 
    refData = faData[0][1] #the reference is assumed to be the first sequence (even if the -n modifier is set to 0)
    for refIndex in range(0,len(refData)): #loops through each index in terms of the reference 
        pointlessSpace = True #each index is assumed to be a space 
        for cntgIndex in range(topNum,len(faData)): #loops through all the sequences that are being checked for spaces 
            seqLen = len(faData[cntgIndex][1]) 
            if refIndex < seqLen: #if the index being tested is present on this contig... 
                bp = faData[cntgIndex][1][refIndex]
                if bp != '-': #if the base is not a space...
                    pointlessSpace = False
        if pointlessSpace:
            delList.append(refIndex)
    if len(delList) > 0:
        delList = mkRange(delList) #makes a list of indices into a list of index ranges
        delList.reverse() #reverses the order of the list to preserve the accuracy of the index values in delList as they are deleted from faData
        for theRange in delList: #for every range specified by delList...
            for cntgIndex in range(0,len(faData)): #for every contig is faData...
                faData[cntgIndex][1] = faData[cntgIndex][1][:theRange[0]] + faData[cntgIndex][1][theRange[1] + 1:] #remove the section determined to be spaces
    else:
        print 'No gaps found...'
    #########################################################################################################################

    ###Out file writing and saveing procedures###############################################################################
    fileSavePath = savePath + 'GapStrip.' + os.path.basename(faPath) #the path to where the output is saved 
    outHandle = open(fileSavePath, 'w') #opens the file object for saving the output
    for line in faData: #prints every line of outData to the output file...
        outHandle.write('>' + line[0] + '\n') #write contig name 
        outHandle.write(line[1] + '\n') #write contig sequence 
    if printOut: #if the -p modifier is supplied
        for line in faData: #prints every line to the standard output
            print '>' + line[0]
            print line[1]
    outHandle.close()
    #########################################################################################################################

    ###Debug Saving procedures###############################################################################################
    if saveDebug: #if the -d modifier is supplied
        fileSavePath = savePath + 'GSDebug.' + os.path.basename(faPath) #the path to where the debug log is to be saved 
        debugHandle = open(fileSavePath, 'w') #opens the file object for saving the debug log 
        for line in debugLog:
            debugHandle.write(line)
        debugHandle.close()
    #########################################################################################################################   
Personal tools