Fatomum.py

From OpenWetWare
Revision as of 16:50, 19 January 2010 by Zachary S. L. Foster (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Copy and past the following into a text editor and save it as fatomum.py to use the script.

#!/usr/bin/env python

#############################################################################################################################
#fatomum.py [options] <fasta file containing alignment>                                                                     #
#Example: python fatomum.py -p ../myAlignment.fsa                                                                           #
#Version: 1.0   Last Modified: 1/15/2010     Author: Zachary S. L. Foster                                                   #
#---------------------------------------------------------------------------------------------------------------------------#
#>Argument: Path to a FASTA file containing a reference and its aligned contigs                                             # 
#---------------------------------------------------------------------------------------------------------------------------#    
#>Creates a MUMmer-style output from a FASTA file of aligned sequences                                                      #
#>The first line is always considered the reference and is not included in the output                                       #
#>The start index in terms of the reference, and in terms of the contig, as well as the length are recorded for each        #
#  matching section of the contig                                                                                           #
#---------------------------------------------------------------------------------------------------------------------------#
#Modifiers:                                                                                                                 #
#   -d      : save debug log; specify where to save the debug log       (default = current working directory)               #
#   -p      : print output                                                                                                  #
#   -m      : set minimum match length; anything less will not be included in the output; default = 10                      #
############################################################################################################################# 

###Imports / Variable Initilization##########################################################################################
import os, string, sys, time
defArgList  = ['fatomum.py','-p','-m','300','C:/Python26/balf.yasrahigh.koraref.CCassembly.fasta'] #argument list used during script testing the with Python GUI IDLE
argList     = sys.argv #argument list supplied by user
argNum      = len(argList) #the number of arguments supplied
minArgNum   = 1 #the smallest amount of arguments with which it is possible to run the script
saveDebug   = False #is True if the debug is to be saved
warning     = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug log
printOut    = False #is True when no the -p modifier is supplied; #The results will be printed to the standard output (usually the screen)
helpOnly    = False #is 'True' when no arguments are given; only help menu is printed
savePath    = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE
modifiers   = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods     = 'dpm' #all the modifiers recognized by the script 
activeMods  = '' #a string containing all modifiers specified by the user 
modArgs     = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
faData      = [] #will contain a list of each contig and its name 
outData     = [] #eventually holds the data that will be saved in the output file 
debugLog    = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied
minMatchLen = 20 #minimum match length; anything less will not be included in the output
#############################################################################################################################

#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error                                                                        #
#>Prints the debug log to standard out (usually the screen)                                                                 #
def errorExit():
    print 'The program was forced to exit prematurely, printing debug log...\n'
    for line in debugLog:
        print line
    sys.exit()
#############################################################################################################################

def printHelp():
    print '/---------------------------------------------------------------------------------------\\'
    print '| fatomum.py [options] <fasta file containing alignment>                               |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Example: python fatomum.py -p ../myAlignment.fsa                                      |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Version: 1.0   Last Modified: 1/15/2010     Author: Zachary S. L. Foster              |'
    print '|  >Creates a MUMmer-style output from a FASTA file of aligned sequences                |'
    print '|  >The first line is always considered the reference and is not included in the output |'
    print '|---------------------------------------------------------------------------------------|'                       
    print '| Modifiers:                                                                            |'
    print '|  -d  : Save debug log                                                                 |'
    print '|  -p  : Print output to standard out (usually the shell being called from)             |'
    print '|  -m  : set minimum match length; anything less will not be considered a match;        |'
    print '|        default = 10                                                                   |'
    print '\\---------------------------------------------------------------------------------------/'

###Argument Interprtation####################################################################################################
#>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE                       #
#>If no arguments are given, the help menu is printed                                                                       #
#>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation)           #
if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
    if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
        if os.path.exists(argList[-1]) == 0: #if the path dose not exist
            debugLog.append('Error: Invalid file path to input data')
            errorExit() #end the program 
    elif argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu 
    else:
        debugLog.append('Error: Too few arguments supplied\n')
        errorExit() 
else: #If the script is being imported on to IDLE 
    argList  = defArgList #use default arguments 
    argNum   = len(defArgList)
    if argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu
    savePath = defSavePath #sets the save path to the default, specified in the variable initialization section 
    debugLog.append('Alert: default debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
    faPath = argList[-1] #the path to the FASTA file containing the input data 
    if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
        modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments
#############################################################################################################################

if helpOnly == False:

    ###Modifier Interpretation###############################################################################################
    #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"     #
    #>Given arguments are compared against a list of known arguments                                                        #
    #>Matches found change the appropriate variable for the desired effect of the modifier the script                       #
    if len(modifiers) > 0: #if modifiers are supplied
        for mod in modifiers: #loops through the list of modifiers and modifier arguments
            if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters         
                activeMods += mod[1:] #sorts the modifiers into activeMods...
            else:  
                modArgs.append(mod) #assumes everything else to be a modifier argument
        for letter in activeMods: #checks if the modifiers are recognized  
            if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script
                debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
                warning = True #if the input modifier is not found
            else:
                if letter == 'd': #if -d is supplied...
                    saveDebug = True #The debug log will be saved to the current working directory 
                if letter == 'p': #if -p is supplied...
                    printOut = True #The results will be printed to the standard output (usually the screen)
                if letter == 'm': #if -n is supplied...
                    if len(modArgs) > 0: #if there is at least one non-processed modifier argument 
                        minMatchLen = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
    #########################################################################################################################

    ###Input data file procedures and parsing################################################################################
    faHandle  = open(faPath, 'r') #opens the input data file 
    faRaw     = faHandle.readlines() #all information is transferred to faRaw
    faHandle.close() #file object is closed
    faRaw.append('> ') #allows the loop to process the last contig
    cntgSeq = ''
    for line in faRaw:
        if line[0] == '>': #if the line is the name of a contig...
            if len(cntgSeq) > 0: #if there is sequence saved from the previous contig 
                faData.append([cntgName,cntgSeq]) #add the previous contig to the list 
                cntgSeq = '' #clear the old sequence 
            cntgName = line[:-1] #new contig name is saved, [:-1] removes '\n'
        else: #if the line is a sequence...
            if line[0] == ' ': # if there is a space at the start of the line...
                line = line[1:] #remove the space
            cntgSeq += line[:-1] #add the sequence to any previous sequence encountered 
    reference = faData[0] #the first sequence is assumed to be the reference
    #########################################################################################################################

    ###MUMmer output emulation###############################################################################################
    outData = []
    for contig in faData[1:]: #for every sequence except the reference...
        cntgMatches = [] #a list of match data for each contig
        name = contig[0] #name of the contig
        index = 0 #reference base index
        inMatch = False #if True, then the current base index contains a match 
        refStart = 0 #the reference index of the first base of a match
        cntgStart = 0 #the contig index of the first base of a matches
        length = 0 #the length of the match
        firstSeqIndex = 0 #the first index of the contig that is not a '-'
        while contig[1][firstSeqIndex] == '-':
            firstSeqIndex += 1
        for bp in contig[1]: #for every base pair...
            if inMatch == True: 
                if ( bp == '-' or bp != reference[1][index] ): #if the current base is '-' or does not match the reference, while in a match, 
                    inMatch = False #therefore, not in a match
                    length = index - refStart #calculate the length of the match, index is the base after the last base of the match
                    if length >= minMatchLen: #if the match is long enough to be included in the output...
                        cntgMatches.append([refStart,cntgStart,length]) #include it
            elif bp != '-' and index < len(reference[1]) and bp == reference[1][index]: #if not in a match, but the currrent base is not '-' and matches the reference..
                inMatch = True #in a new match
                refStart = index #record start
                cntgStart = refStart - firstSeqIndex #start in terms of the contig 
            index +=1
        outData.append(name) #adds the name to the output, regardless of whether the contig has any matchs or not
        for match in cntgMatches: #if every match on this contig...
            outData.append('   ' + str(match[0]) + '\t' + str(match[1]) + '\t' + str(match[2])) #record a string version for the output 
    #########################################################################################################################

    ###Out file writing and saving procedures###############################################################################
    fileSavePath = savePath + os.path.basename(faPath) + '.fsa' #the path to where the output is saved
    outHandle = open(fileSavePath, 'w') #opens the file object for saving the output
    for line in outData: #prints every line of outSeqs to the output file...
        outHandle.write(line)
    if printOut: #if the -p modifier is supplied...
        for line in outData:
            print line
    outHandle.close() #closes file object
    #########################################################################################################################

    ###Debug Saving procedures###############################################################################################
    if saveDebug: #if the -d modifier is supplied...
        debugHandle = open(savePath + os.path.basename(qualPath) + '_debug.txt', 'w')
        for line in debugLog:
            debugHandle.write(line)
        debugHandle.close()
    #########################################################################################################################