Fatomum.py

From OpenWetWare

Revision as of 18:50, 19 January 2010 by Zachary S. L. Foster (Talk | contribs)
(diff) ←Older revision | Current revision (diff) | Newer revision→ (diff)
Jump to: navigation, search

Copy and past the following into a text editor and save it as fatomum.py to use the script.

#!/usr/bin/env python

#############################################################################################################################
#fatomum.py [options] <fasta file containing alignment>                                                                     #
#Example: python fatomum.py -p ../myAlignment.fsa                                                                           #
#Version: 1.0   Last Modified: 1/15/2010     Author: Zachary S. L. Foster                                                   #
#---------------------------------------------------------------------------------------------------------------------------#
#>Argument: Path to a FASTA file containing a reference and its aligned contigs                                             # 
#---------------------------------------------------------------------------------------------------------------------------#    
#>Creates a MUMmer-style output from a FASTA file of aligned sequences                                                      #
#>The first line is always considered the reference and is not included in the output                                       #
#>The start index in terms of the reference, and in terms of the contig, as well as the length are recorded for each        #
#  matching section of the contig                                                                                           #
#---------------------------------------------------------------------------------------------------------------------------#
#Modifiers:                                                                                                                 #
#   -d      : save debug log; specify where to save the debug log       (default = current working directory)               #
#   -p      : print output                                                                                                  #
#   -m      : set minimum match length; anything less will not be included in the output; default = 10                      #
############################################################################################################################# 

###Imports / Variable Initilization##########################################################################################
import os, string, sys, time
defArgList  = ['fatomum.py','-p','-m','300','C:/Python26/balf.yasrahigh.koraref.CCassembly.fasta'] #argument list used during script testing the with Python GUI IDLE
argList     = sys.argv #argument list supplied by user
argNum      = len(argList) #the number of arguments supplied
minArgNum   = 1 #the smallest amount of arguments with which it is possible to run the script
saveDebug   = False #is True if the debug is to be saved
warning     = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug log
printOut    = False #is True when no the -p modifier is supplied; #The results will be printed to the standard output (usually the screen)
helpOnly    = False #is 'True' when no arguments are given; only help menu is printed
savePath    = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE
modifiers   = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods     = 'dpm' #all the modifiers recognized by the script 
activeMods  = '' #a string containing all modifiers specified by the user 
modArgs     = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
faData      = [] #will contain a list of each contig and its name 
outData     = [] #eventually holds the data that will be saved in the output file 
debugLog    = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied
minMatchLen = 20 #minimum match length; anything less will not be included in the output
#############################################################################################################################

#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error                                                                        #
#>Prints the debug log to standard out (usually the screen)                                                                 #
def errorExit():
    print 'The program was forced to exit prematurely, printing debug log...\n'
    for line in debugLog:
        print line
    sys.exit()
#############################################################################################################################

def printHelp():
    print '/---------------------------------------------------------------------------------------\\'
    print '| fatomum.py [options] <fasta file containing alignment>                               |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Example: python fatomum.py -p ../myAlignment.fsa                                      |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Version: 1.0   Last Modified: 1/15/2010     Author: Zachary S. L. Foster              |'
    print '|  >Creates a MUMmer-style output from a FASTA file of aligned sequences                |'
    print '|  >The first line is always considered the reference and is not included in the output |'
    print '|---------------------------------------------------------------------------------------|'                       
    print '| Modifiers:                                                                            |'
    print '|  -d  : Save debug log                                                                 |'
    print '|  -p  : Print output to standard out (usually the shell being called from)             |'
    print '|  -m  : set minimum match length; anything less will not be considered a match;        |'
    print '|        default = 10                                                                   |'
    print '\\---------------------------------------------------------------------------------------/'

###Argument Interprtation####################################################################################################
#>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE                       #
#>If no arguments are given, the help menu is printed                                                                       #
#>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation)           #
if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
    if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
        if os.path.exists(argList[-1]) == 0: #if the path dose not exist
            debugLog.append('Error: Invalid file path to input data')
            errorExit() #end the program 
    elif argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu 
    else:
        debugLog.append('Error: Too few arguments supplied\n')
        errorExit() 
else: #If the script is being imported on to IDLE 
    argList  = defArgList #use default arguments 
    argNum   = len(defArgList)
    if argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu
    savePath = defSavePath #sets the save path to the default, specified in the variable initialization section 
    debugLog.append('Alert: default debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
    faPath = argList[-1] #the path to the FASTA file containing the input data 
    if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
        modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments
#############################################################################################################################

if helpOnly == False:

    ###Modifier Interpretation###############################################################################################
    #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"     #
    #>Given arguments are compared against a list of known arguments                                                        #
    #>Matches found change the appropriate variable for the desired effect of the modifier the script                       #
    if len(modifiers) > 0: #if modifiers are supplied
        for mod in modifiers: #loops through the list of modifiers and modifier arguments
            if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters         
                activeMods += mod[1:] #sorts the modifiers into activeMods...
            else:  
                modArgs.append(mod) #assumes everything else to be a modifier argument
        for letter in activeMods: #checks if the modifiers are recognized  
            if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script
                debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
                warning = True #if the input modifier is not found
            else:
                if letter == 'd': #if -d is supplied...
                    saveDebug = True #The debug log will be saved to the current working directory 
                if letter == 'p': #if -p is supplied...
                    printOut = True #The results will be printed to the standard output (usually the screen)
                if letter == 'm': #if -n is supplied...
                    if len(modArgs) > 0: #if there is at least one non-processed modifier argument 
                        minMatchLen = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
    #########################################################################################################################

    ###Input data file procedures and parsing################################################################################
    faHandle  = open(faPath, 'r') #opens the input data file 
    faRaw     = faHandle.readlines() #all information is transferred to faRaw
    faHandle.close() #file object is closed
    faRaw.append('> ') #allows the loop to process the last contig
    cntgSeq = ''
    for line in faRaw:
        if line[0] == '>': #if the line is the name of a contig...
            if len(cntgSeq) > 0: #if there is sequence saved from the previous contig 
                faData.append([cntgName,cntgSeq]) #add the previous contig to the list 
                cntgSeq = '' #clear the old sequence 
            cntgName = line[:-1] #new contig name is saved, [:-1] removes '\n'
        else: #if the line is a sequence...
            if line[0] == ' ': # if there is a space at the start of the line...
                line = line[1:] #remove the space
            cntgSeq += line[:-1] #add the sequence to any previous sequence encountered 
    reference = faData[0] #the first sequence is assumed to be the reference
    #########################################################################################################################

    ###MUMmer output emulation###############################################################################################
    outData = []
    for contig in faData[1:]: #for every sequence except the reference...
        cntgMatches = [] #a list of match data for each contig
        name = contig[0] #name of the contig
        index = 0 #reference base index
        inMatch = False #if True, then the current base index contains a match 
        refStart = 0 #the reference index of the first base of a match
        cntgStart = 0 #the contig index of the first base of a matches
        length = 0 #the length of the match
        firstSeqIndex = 0 #the first index of the contig that is not a '-'
        while contig[1][firstSeqIndex] == '-':
            firstSeqIndex += 1
        for bp in contig[1]: #for every base pair...
            if inMatch == True: 
                if ( bp == '-' or bp != reference[1][index] ): #if the current base is '-' or does not match the reference, while in a match, 
                    inMatch = False #therefore, not in a match
                    length = index - refStart #calculate the length of the match, index is the base after the last base of the match
                    if length >= minMatchLen: #if the match is long enough to be included in the output...
                        cntgMatches.append([refStart,cntgStart,length]) #include it
            elif bp != '-' and index < len(reference[1]) and bp == reference[1][index]: #if not in a match, but the currrent base is not '-' and matches the reference..
                inMatch = True #in a new match
                refStart = index #record start
                cntgStart = refStart - firstSeqIndex #start in terms of the contig 
            index +=1
        outData.append(name) #adds the name to the output, regardless of whether the contig has any matchs or not
        for match in cntgMatches: #if every match on this contig...
            outData.append('   ' + str(match[0]) + '\t' + str(match[1]) + '\t' + str(match[2])) #record a string version for the output 
    #########################################################################################################################

    ###Out file writing and saving procedures###############################################################################
    fileSavePath = savePath + os.path.basename(faPath) + '.fsa' #the path to where the output is saved
    outHandle = open(fileSavePath, 'w') #opens the file object for saving the output
    for line in outData: #prints every line of outSeqs to the output file...
        outHandle.write(line)
    if printOut: #if the -p modifier is supplied...
        for line in outData:
            print line
    outHandle.close() #closes file object
    #########################################################################################################################

    ###Debug Saving procedures###############################################################################################
    if saveDebug: #if the -d modifier is supplied...
        debugHandle = open(savePath + os.path.basename(qualPath) + '_debug.txt', 'w')
        for line in debugLog:
            debugHandle.write(line)
        debugHandle.close()
    #########################################################################################################################   

Personal tools