Basediff.py: Difference between revisions

From OpenWetWare
Jump to navigationJump to search
mNo edit summary
mNo edit summary
 
Line 1: Line 1:
Copy and past the following into a text editor and save it as sumqual.py to use the script.
Copy and past the following into a text editor and save it as basediff.py to use the script.
<pre>
<pre>
#!/usr/bin/env python
#!/usr/bin/env python

Latest revision as of 17:47, 14 January 2010

Copy and past the following into a text editor and save it as basediff.py to use the script.

#!/usr/bin/env python

#############################################################################################################################
#basediff.py [options] <path to aligned fasta file>                                                                         #
#Example: python basediff.py -i -# 1,2-5,7-9 ../myFile.fsa                                                                  #
#Version: 1.2   Last Modified: 12/22/2009                                                                                   #
#---------------------------------------------------------------------------------------------------------------------------#
#>Argument 1: path to fasta file containing aligned sequences                                                               #
#---------------------------------------------------------------------------------------------------------------------------#
#>Finds base differences between aligned sequences in a single file                                                         #
#>Output a tab-delimitated list of differing sets of bases                                                                  #
#---------------------------------------------------------------------------------------------------------------------------#
#Modifiers:                                                                                                                 #
#   -d      : include differences caused by - or ~                                                                          #
#   -i      : include differences caused by IUPAC nucleic acid codes                                                        #
#   -n      : include differences caused by N's                                                                             #
#   -p      : print output                                                                                                  #
#   -s      : include differences caused by absence of sequence (aka unequal sequence length)                               #  
#   -# arg  : specify which sequences in a file to compare; Example: -# 1,2,5-9,11                                          #
#############################################################################################################################

###Imports / Variable Initilization##########################################################################################
import os, string, sys, time
defArgList  = ['basediff.py','-s','C:/Python26/basediffdef.fa'] #argument list used during script testing the with Python GUI IDLE 
argList     = sys.argv #argument list supplied by user
argNum      = len(argList) #the number of arguments supplied 
minArgNum   = 1 #the smallest amount of arguments with which it is possible to run the script 
warning     = False #is set to 'True' if the program encounters any minor errors during the analysis
printOut    = False #is set to 'True' if the '-p' modifier is used
helpOnly    = False #is set to 'True' if no arguments are supplied; causes the help text to be printed
compChars   = 'AGTCU' #All the nucleic acid codes recognized by the script; expanded by the -i, -n, and -d modifiers
lineNumIn   = 'ALL' #contains the input for the '-#' modifier, specifies which lines in the input file are used in the analysis 
savePath    = './'  #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE 
modifiers   = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods     = 'dinps#' #all the modifiers recognized by the script 
activeMods  = '' #a string containing all modifiers specified by the user 
modArgs     = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
#############################################################################################################################

def printHelp():
    print '/---------------------------------------------------------------------------------------\'
    print '| basediff.py [options] <path to aligned fasta file>                                    |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Example: python basediff.py -i -# 1,2-5,7-9 ../myFile.fsa                             |'
    print '|---------------------------------------------------------------------------------------|'
    print '| Version: 1.2   Last Edited: 12/21/2009                                                |'
    print '|\t>Finds base differences between aligned sequences in a single file                   |'
    print '|\t>Output a tab-delimitated list of differing sets of bases                            |'
    print '|---------------------------------------------------------------------------------------|'                       
    print '| Modifiers:                                                                            |'
    print '|\t-d\t: include differences caused by - or ~                                           |'
    print '|\t-i\t: include differences caused by IUPAC nucleic acid codes                         |'
    print '|\t-n\t: include difference caused by Ns                                                |'
    print '|\t-p\t: print output                                                                   |'
    print '|\t-s\t: include differences caused by absence of sequence (aka unequal sequence length)|'
    print '|\t-#\t: specify which sequences in a file to compare. ex: -# 1-3,5,9                   |'
    print '\---------------------------------------------------------------------------------------/'
        

###Argument Interprtation####################################################################################################
#>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE                       #
#>If no arguments are given, the help menu is printed                                                                       #
#>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation)           #
if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
    if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
        if os.path.exists(argList[-1]) == 0: #if the path dose not exist
            print 'Error: Invalid file path to input fasta file'
            sys.exit() #end the program 
    elif argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu 
    else:
        print 'Error: Too few arguments supplied\n'  
else: #If the script is being imported on to IDLE 
    argList  = defArgList #use default arguments 
    argNum   = len(defArgList)
    if argNum == 1: #If no arguments are supplied...
        helpOnly = True
        printHelp() #prints help menu
    savePath = defSavePath #sets the save path to the default, specified in the variable initialization section 
    print 'Alert: defualt debugging input arguments are being used\n'
if helpOnly == False: #if arguments were supplied...
    faPath = argList[-1] #the path to the fasta file containing the input data 
    if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
        modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments
#############################################################################################################################

if helpOnly == False:
    ###Modifier Interpretation###############################################################################################
    #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"     #
    #>Given arguments are compared against a list of known arguments                                                        #
    #>Matches found change the appropriate variable for the desired effect of the modifier the script                       #
    if len(modifiers) > 0: #if modifiers are supplied
        for mod in modifiers: #loops through the list of modifiers and modifier arguments 
            if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters        
                activeMods += mod[1:] #sorts the modifiers into activeMods...
            else:  
                modArgs.append(mod) #assumes everything else to be a modifier argument 
        for letter in activeMods: #loops through supplied modifiers    
            if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script 
                print 'argument not recognized' #if the input modifier is not found
            else: #if it is a recognized modifier...
                if letter == 'd': #if -d is supplied...
                    compChars += '-~' #-'s and ~'s are recognized by the script
                if letter == 'i': #if -i is supplied...
                    compChars += 'RYMKWSBDHV' #IUPAC nucleic acid codes are recognized by script 
                if letter == 'n': #if -n is supplied...
                    compChars += 'N' #N's are recognized by the script 
                if letter == 'p': #if -p is supplied...
                    printOut = True #The results will be printed to the standard output (usually the screen)
                if letter == 's': #if -s is supplied...
                    compChars += ' ' #absence of sequence, caused by sequences of diffrent length, is considered a difference  
                if letter == '#': #if -# is supplied...
                    if len(modArgs) > 0: #if there is at least one non-processed modifier argument 
                        lineNumIn = modArgs[0] #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
    #########################################################################################################################

    ###Input data file procedures############################################################################################
    #>extracts the input data from the files at the supplied path                                                           #
    #>Parses the data into a list in the following format [[name of contig, sequence],[..]..]                               #
    #>interprates the -# modifier's argument and applies its effects to the parsed list                                     #
    faHandle  = open(faPath, 'r') #creates a file object 
    faRaw     = faHandle.readlines() #all of the content of the input file in its raw form 
    faData    = [] #will hold all data from faRaw in the following format [[name of contig, sequence],[..]..]
    faFinal   = [] #will hold the the lines from faData that lineNumIn specifies (if -# modifier is supplied)
    faHandle.close() #closes the file object
    faRaw.append('>') #makes it so the last sequence can be processed in the following loop
    name = faRaw[0][:-1] # initialized the variable for the loop, so that the first sequence can be included 
    seq  = ''
    for line in faRaw[1:]: #loops through the lines of the input file
        if line[0] == '>': #if the first character of the line is a '>'
            faData.append([name,seq])
            name = line[:-1]
            seq = ''
        else:
            seq += line[:-1] 
    if lineNumIn == 'ALL': #if the -# modifier is not supplied
        faFinal = faData #all lines are included in the analysis 
    else: # if the -# modifier is supplied
        indexList = []
        for num in string.split(lineNumIn,','): #loops through each entry separated by commas of the -# argument (e.g. 1,2,3-5,7)    
            if string.find(num,'-') != -1: #if the entry is a range of values 
                startNum = int(string.split(num,'-')[0]) #starting value of the range 
                endNum   = int(string.split(num,'-')[-1]) #ending value of the range 
                while startNum <= endNum: #repeats for each value from start to end (inclusive)
                    indexList.append(startNum - 1) #include the information at the current value in faFinal 
                    startNum += 1
            else: #if the entry is just a single number
                indexList.append(int(num) - 1) #include the information of that value in faFinal
        for index in indexList:
            if index < len(faData):
                faFinal.append(faData[index]) #include the information of that value in faFinal
    #########################################################################################################################
                 
    ###Base-by-base comparision##############################################################################################
    #>Finds the indices at which there are disagreements between the sequences and saved them in a list                     #
    maxLen = 0 #will hold the length of the longest contig 
    outIndexs = [] #a list of of the indices at which differences occur
    for cntg in faFinal: #loops through the contigs and finds the maximum length... 
        if len(cntg[1]) > maxLen:
            maxLen = len(cntg[1])
    for index in range(0,maxLen): #loops through the sequence indices          
        first = faFinal[0][1][index] #save the character at "index" in the first contig for comparison
        for cntg in faFinal: #loops through the contigs (still at a specific index) 
            if index >= len(cntg[1]): #if the length of the contig sequence is less than the current index being tested...
                cntg[1] += ' ' #the space character is used to indicate a lack of sequence
            if string.find(compChars,cntg[1][index].upper())!= -1: #if the character at "index" in "cntg" is included in the recognized characters 
                if first != cntg[1][index]: #if the character in the first contig is different from that at the same index in a different contig...
                    outIndexs.append(index) #add the index of the difference to the outIndexs list
                    break
    #########################################################################################################################

    ###Output data generation################################################################################################
    #>generates the text used in the output from the data of the previous section, "Base-by-base comparison"                #
    #>the schema is as follows: Base#\tContigName1\tContigName2\tContigName3...                                             #
    outData = [] #a list of strings corresponding to each line in the output file 
    schema = 'Base#' #Will hold the first line of the output file 
    for cntg in faFinal: #for every contig...
        schema += '\t' + cntg[0][1:] #add the name of the contig to the schema 
    outData.append(schema) #write the schema to outData 
    for index in outIndexs: #loops through the indicies of the bases determined to be diffrent in the above section
        baseOut = str(index + 1) #a string version of the index starting from 1 intead of 0
        for cntg in faFinal: #loops through all of the contigs
            #print index
            #print cntg
            if index < len(cntg[1]):
                baseOut += '\t' + str(cntg[1][index]) #adds the base value to the line, separated by tabs
        outData.append(baseOut) 
    #########################################################################################################################
          
    ###Out file writing and saveing procedures###############################################################################
    fileSavePath = savePath + 'BaseDiff.' + os.path.basename(faPath) + '.txt' #the path to where the output is saved 
    outHandle = open(fileSavePath, 'w') #opens the file object for saving the output  
    for line in outData: #prints every line of outData to the output file...
        outHandle.write(line + '\n')
    outHandle.close()
    if printOut: #if the -p modifier is supplied 
        for line in outData: #prints every line to the standard output
            print line
    #########################################################################################################################