Qualtofa.py

Copy and past the following into a text editor and save it as qualtofa.py to use the script.
#!/usr/bin/env python
 
#############################################################################################################################
# qualtofa [options] <YASRA/sumqual quality file path>                                                                      #
#---------------------------------------------------------------------------------------------------------------------------#
# Example: python qualtofa.py -i -c q 20 r -q 5 -s 20 ../myValues.qual                                                      #
#---------------------------------------------------------------------------------------------------------------------------#
# Version: 2.3   Last Modified: 4/19/2010     Author: Zachary S. L. Foster                                                  #
# Intended uses:                                                                                                            #
#  1)Produces alignments in FASTA format from sumquals output.                                                              #
#  2)Extract the sequences of multiple contigs from a YASRA quality file.                                                   #
#---------------------------------------------------------------------------------------------------------------------------#
# Modifiers:                                                                                                                #
#  -c  : Include each contig on its own line... (EX: -c q 20 r)                                                             #
#        q  : Set masking value for contigs (Analogous to -q modifier below; reference NOT needed).                         #
#        s* : Set SNP masking value for contigs (Analogous to -s modifier below).                                           #
#        r* : Do NOT align to the reference using '-'s for spacers.                                                         #
#  -d  : Save debug log to current working directory.                                                                       #
#  -i* : Condense all overlapping regions into IUPAC ambiguity codes...                                                     #
#        Default: Add deletions to the reference to accommodate the overlapping sequence.                                   #
#  -m* : Only include matching regions from the quality file.                                                               #
#  -p  : Print output to screen.                                                                                            #
#  -q* : Set minimum masking value (a new masked sequence will be added to the output file).                                #
#  -s* : Set minimum masking value for SNPs.                                                                                #
#  -a  : Include contained contigs (defalt: save sequences of contiained contigs to a seperate file)                        #
#  -t  : Trim the ends of the contigs by the specified number of bases.                                                     #
#  -v  : Trim all the bases on either end of all contigs that have a quality value less than the specified amount.          #
#  -b  : Verbose output; print progress reports and statistics.                                                             #
#  -l  : Set minimum match length                                                                                           #
#  -S  : Silent; Do not print anything to the standard output (aka screen)                                                  #
#  -n  : save SNPs to a .qual file 
#                                                                                                                           #
# *Note - these modifiers rely on the reference included in the output of sumqual. If your quality                          #
#         file is a list of unaligned contigs (e.g. from YASRA), these modifiers cannot be used.                            #
#############################################################################################################################


###Imports / Variable Initilization##########################################################################################
import os, string, sys, time, copy
defArgList  = ['qualtofa.py','-b','-c','s','1','-c','r','-i','-n','-s','100','C:/Python26/AGCT.filtered.delta.qual']
#defArgList  = ['qualtofa.py','-i','-q','20','-s','100','-c','-v','15','-b','-l','100','C:/Python26/out.delta.qual']
#defArgList  = ['qualtofa.py','-v','50','-c','q','20','s','100','C:/Python26/out.delta.qual']
#defArgList  = ['qualtofa.py','-t','10','-c','q','20','s','100','C:/Python26/out.delta.qual']
argList     = sys.argv #argumen list supplied by user
argNum      = len(argList) #the number of arguments supplied
minArgNum   = 1 #the smallest amount of arguments with which it is possible to run the script
saveDebug   = False #is True if the debug is to be saved
warning     = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug log
printOut    = False #is True when no the -p modifier is supplied; #The results will be printed to the standard output (usually the screen)
helpOnly    = False #is 'True' when no arguments are given; only help menu is printed
includeCntgs= False #is True when the -c modifier is used
IUPAC       = False #is True when the -i modifier is used
matchsOnly  = False #is True when the -m modifier is used
alignCntgs  = True #is False when the r argument of the -c modifier is used
SQaligned   = False #if the input file has the sumqual output format
verbose     = False #is True when the -b modifier is used
SNPs        = False
silent      = False
minMatchLen = 50 #Minimum match length accepted (after contig end trimming by -t and -v modifiers)
trimNum     = 0 #number of bases removed from the ends of all the contigs
trimQual    = 0 #minimum quality value needed to not be removed from the ends of the contigs
containedCngts = False #is True when the -a modifier is used
maskingChar = 'N' #What represents a "unknown" base
minQualScore= 0 #determined by the presence of the -q modifier
cntgQualScore= 0 #determined by the presence of the -c modifier, with argument q
cntgSNPScore= 0 #determined by the presence of the -c modifier, with argument s
minSNPScore = 0 #determined by the presence of the -s modifier
savePath    = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE
modifiers   = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods     = 'cdeimpqastvblSn' #all the modifiers recognized by the script 
activeMods  = '' #a string containing all modifiers specified by the user 
modArgs     = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
qualData    = []
refPath     = ''
refData     = []
outData     = [] #eventually holds the data that will be saved in the output file 
debugLog    = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied
#############################################################################################################################

#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error                                                                        #
#>Prints the debug log to standard out (usually the screen)                                                                 #
def errorExit():
    print 'The program was forced to exit prematurely, printing debug log...\n'
    for line in debugLog:
        print line
    sys.exit()
#############################################################################################################################

def printHelp(): #is called if no arguments are supplied
    print '/--------------------------------------------------------------------------------------------------\\'
    print '| qualtofa [options] <YASRA/sumqual quality file path>                                             |'
    print '|--------------------------------------------------------------------------------------------------|'
    print '| Example: python qualtofa.py -i -c q 20 r -q 5 -s 20 ../myValues.qual                             |'
    print '|--------------------------------------------------------------------------------------------------|'
    print '| verboseOut: 2.4   Last Modified: 6/24/2010     Author: Zachary S. L. Foster                          |'
    print '| Intended uses:                                                                                   |'
    print '|  1)Produces alignments in FASTA format from sumquals output.                                     |'
    print '|  2)Extract the sequences of multiple contigs from a YASRA quality file.                          |'
    print '|--------------------------------------------------------------------------------------------------|'                       
    print '| Modifiers:                                                                                       |'
    print '|  -c  : Include each contig on its own line... (EX: -c q 20 r)                                    |'
    print '|        q  : Set masking value for contigs (Analogous to -q modifier below; reference NOT needed).|'
    print '|        s* : Set SNP masking value for contigs (Analogous to -s modifier below).                  |'
    print '|        r* : Align to the reference using dashs for spacers.                                      |'
    print '|  -d  : Save debug log to current working directory.                                              |'
    print '|  -i* : Condense all overlapping regions into IUPAC ambiguity codes...                            |'
    print '|        Default: Add deletions to the reference to accommodate the overlapping sequence.          |'
    print '|  -m* : Only include matching regions from the quality file.                                      |'
    print '|  -p  : Print output to screen.                                                                   |'
    print '|  -q* : Set minimum masking value (a new masked sequence will be added to the output file).       |'
    print '|  -s* : Set minimum masking value for SNPs.                                                       |'
    print '|  -a  : Include contained contigs Defalt: save sequences of contiained contigs to a seperate file |'
    print '|  -t  : Trim the ends of the contigs by the specified number of bases.                            |'
    print '|  -v  : Trim all the bases on either end of all contigs that have a quality value less than       |'
    print '|        the specified amount.                                                                     |'
    print '|  -b  : Verbose output; print progress reports and statistics.                                    |'
    print '|  -l  : Set minimum match length                                                                  |'
    print '|  -S  : Silent; Do not print anything to the standard output (aka screen)                         |'
    print '|                                                                                                  |'
    print '| *Note - these modifiers rely on the reference included in the output of sumqual. If your quality |'
    print '|         file is a list of unaligned contigs (e.g. from YASRA), these modifiers cannot be used.   |'
    print '\\--------------------------------------------------------------------------------------------------/'

#############################################################################################################################
#>function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)               #
def mkRange(numList):
    outList = []
    start   = numList[0]
    current = numList[0]
    numList.append(numList[-1])
    for num in numList[1:]:
        if (num - 1) != current:
            outList.append([start,current])
            start = num
        current = num
    return outList
#############################################################################################################################

###Argument Interprtation####################################################################################################
#>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE                       #
#>If no arguments are given, the help menu is printed                                                                       #
#>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation)           #
#if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
    if os.path.exists(argList[-1]) == 0: #if the path dose not exist
        debugLog.append('Error: Invalid file path to input data')
        errorExit() #end the program 
elif argNum == 1: #If no arguments are supplied...
    helpOnly = True
    printHelp() #prints help menu 
else:
    debugLog.append('Error: Too few arguments supplied\n')
    errorExit() 
#else: #If the script is being imported on to IDLE 
#    argList  = defArgList #use default arguments 
#    argNum   = len(defArgList)
#    if argNum == 1: #If no arguments are supplied...
#        helpOnly = True
#        printHelp() #prints help menu
#    savePath = defSavePath #sets the save path to the default, specified in the variable initialization section 
#    debugLog.append('Alert: default debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
    qualPath = argList[-1] #the path to the qual file containing the input data 
    if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
        modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments
#############################################################################################################################
        
if helpOnly == False:

    ###Modifier Interpretation###############################################################################################
    #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"     #
    #>Given arguments are compared against a list of known arguments                                                        #
    #>Matches found change the appropriate variable for the desired effect of the modifier the script                       #
    if len(modifiers) > 0: #if modifiers are supplied
        for mod in modifiers: #loops through the list of modifiers and modifier arguments
            if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters         
                activeMods += mod[1:] #sorts the modifiers into activeMods...
            else:  
                modArgs.append(mod) #assumes everything else to be a modifier argument
        for letter in activeMods: #checks if the modifiers are recognized  
            if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script
                debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
                warning = True #if the input modifier is not found
            else:
                if letter == 'c': #if -c is supplied...
                    if len(modArgs) > 0 and modArgs[0] == 'q': #if there is at least two non-processed modifier arguments
                        if len(modArgs) > 1:
                            cntgQualScore = int(modArgs[1]) 
                            del modArgs[:2] #the modifier arguments are deleted from the list
                        else:
                            debugLog.append('Error: Modifier argument not supplied\n')
                            errorExit() #exit the script
                    if len(modArgs) > 0 and modArgs[0] == 's': #if there is at least two non-processed modifier arguments
                        if len(modArgs) > 1:
                            cntgSNPScore = int(modArgs[1]) 
                            del modArgs[:2] #the modifier arguments are deleted from the list
                        else:
                            debugLog.append('Error: Modifier argument not supplied\n')
                            errorExit() #exit the script
                    if len(modArgs) > 0 and modArgs[0] == 'r': #if there is at least one non-processed modifier argument
                        alignCntgs = False
                        del modArgs[0] #the modifier argument is deleted from the list
                    includeCntgs = True #include each contig on its own line
                elif letter == 'd': #if -d is supplied...
                    saveDebug = True #The debug log will be saved to the current working directory
                elif letter == 'a': #if -a is supplied...
                    containedCngts = True
                elif letter == 'e': #if -e is supplied...
                    saveEditLog = True #save log of edits made to the contig
                elif letter == 'i': #if -i is supplied...
                    IUPAC = True #condense all overlapping regions into IUPAC ambiguity codes
                elif letter == 'm': #if -m is supplied...
                    matchsOnly = True #only include matching regions of from the qual file
                elif letter == 'p': #if -p is supplied...
                    printOut = True #The results will be printed to the standard output (usually the screen)
                elif letter == 'b': #if -b is supplied...
                    verbose = True #Verbose output
                elif letter == 'S': #if -S is supplied...
                    silent = True #Verbose output
                elif letter == 'n': #if -n is supplied...
                    SNPs = True 
                elif letter == 'q': #if -q is supplied...
                    if len(modArgs) > 0: #if there is at least one non-processed modifier argument 
                        minQualScore = int(modArgs[0]) 
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
                elif letter == 's': #if -s is supplied...
                    if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
                        minSNPScore = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
                elif letter == 't': #if -t is supplied...
                    if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
                        trimNum = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
                elif letter == 'v': #if -v is supplied...
                    if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
                        trimQual = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
                elif letter == 'l': #if -l is supplied...
                    if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
                        minMatchLen = int(modArgs[0]) #The string specifying the desired lines to process from the input file
                        del modArgs[0] #the original argument is deleted from the list 
                    else: #if the list of modifier arguments is empty...
                        print 'Error: Modifier argument not supplied\n'
                        sys.exit() #exit the script
    if verbose:
        print 'qualtofa: Version: 2.4   Last Modified: 6/24/2010 by Zachary S. L. Foster' 
    #########################################################################################################################

    if verbose:
        print 'Parsing input file ' + qualPath + '.',

    ###Quality Value File Parsing############################################################################################
    qualHandle = open(qualPath, 'r') #opens the file containing the quality values
    qualRaw    = qualHandle.readlines() #saves all of the file into qualRaw
    qualHandle.close() #closes the file object
    qualName = qualRaw[2][:-1] #the name of the first contig is the 3rd line minus the newline
    qualVals = [] #will contain a list of quality values for each contig 
    qualData = [] #the data structure holding the parsed data 
    qualRaw.append('randomline') #place mark needed to process the last contig
    for line in qualRaw[3:]: #for every line past the first 4...
        splitLine = string.split(line) #make a list of all the parts of line separated by whitespace
        if len(splitLine) == 1 and splitLine[0].isdigit() == False: #if there is only one "word" (i.e. it is a name of a contig)
            qualData.append([qualName,qualVals]) #add the previous contig to the parsed data 
            qualName = splitLine[0] #new contig name 
            qualVals = [] 
        else: #if the line is a set of quality values...
            qualVals.append(splitLine) #append them to the list of quality values for this contig
    #########################################################################################################################

    if verbose:
        print '.',

    ###Reference Extraction##################################################################################################
    schema = string.split(qualRaw[1])
    if len(schema) > 0 and schema[1] == 'Reference':
        SQaligned = True
        reference = []
        for qual in qualData[0][1]:
            reference.append(qual[1])
    #########################################################################################################################

    if verbose:
        print '.',

    ###Contig Extraction####################################################################################################
    contigs = [] #will hold a list of contig names and their sequences (ex: [['name1',['A','C','G']],['name2',['G','C','A']],...])
    def addBase(cntgName,baseQuals):
        for cntgIndex in range(0,len(contigs)):
            if contigs[cntgIndex][0] == cntgName:
                contigs[cntgIndex][1].append(baseQuals)
                return
        if SQaligned:    
            contigs.append([cntgName,[baseQuals],refIndex])
        else:
            contigs.append([cntgName,[baseQuals]])
        
    if SQaligned:
        qualNum = 9
        refColNum = 3
    else:
        qualNum = 7
        refColNum = 0
    for cntgIndex in range(0,len(qualData)): #for every contig...
        for baseIndex in range(0,len(qualData[cntgIndex][1])): #for every base of every contig
            baseDepth = (len(qualData[cntgIndex][1][baseIndex]) - refColNum + 1) / qualNum #the number of distinct bases (quality value data) per index
            for depth in range(0,baseDepth): #for every quality value for every base in every contig
                startIndex = (depth * qualNum) + refColNum + 1 #the index of the character value for that quality value
                endIndex = startIndex + 5
                if SQaligned:
                    name = qualData[cntgIndex][1][baseIndex][startIndex - 2]
                    refIndex = qualData[cntgIndex][1][baseIndex][0]
                else:
                    name = qualData[cntgIndex][0]
                quals = qualData[cntgIndex][1][baseIndex][startIndex:endIndex + 2]
                addBase(name,quals) #add the character value to the list
    #########################################################################################################################

    ###Conitg Match Index addition#########################################################################################
    if SQaligned:
        rejects = [] 
        for cntgIndex in range(0,len(contigs)):
            inMatch = False
            cntgEnd = 0
            cntgStart = 0
            for baseIndex in range(0,len(contigs[cntgIndex][1])):
                if contigs[cntgIndex][1][baseIndex][0].isupper() and inMatch == False:
                    cntgStart = baseIndex + 1
                    inMatch = True
                elif contigs[cntgIndex][1][baseIndex][0].islower() and inMatch == True:
                    cntgEnd = baseIndex
                    break
            if cntgEnd == 0:
                cntgEnd = len(contigs[cntgIndex][1])
            refStart = int(contigs[cntgIndex][2]) + cntgStart - 1
            refEnd = int(contigs[cntgIndex][2]) + cntgEnd - 1
            del contigs[cntgIndex][2]
            if refEnd - refStart < minMatchLen:
                rejects.append(cntgIndex)
            else:
                contigs[cntgIndex].append([refStart,refEnd,cntgStart,cntgEnd])
        rejects.reverse()
        for index in rejects:
            del contigs[index]
        if verbose:
            print 'Complete'
            print '    ' + str(len(contigs)) + 'contigs will be included in the anaylsis.'
            if len(rejects) == 1:
                print '    1 contig was not included due to insufficient sequence length ( < ' + str(minMatchLen) + ' ).'
            elif len(rejects) > 1:
                print '    ' + str(len(rejects)) + ' contigs were not included due to insufficient sequence length ( < ' + str(minMatchLen) + ' ).'

        
    #########################################################################################################################
 
    ###Contig end trimming###################################################################################################
    def trim(cntgIndex,startT,endT):
        if startT + endT + minMatchLen >= (contigs[cntgIndex][2][1] - contigs[cntgIndex][2][0] + 1):
            del contigs[cntgIndex]
            return 1
        else:
            unMatchedStart = contigs[cntgIndex][2][2] - 1 #the number of bases before the match 
            unMatchedEnd   = len(contigs[cntgIndex][1]) - contigs[cntgIndex][2][3] #the number of bases after the match
            matchedStart   = startT  - unMatchedStart
            matchedEnd     = endT - unMatchedEnd
            if matchedStart > 0: #if the unmatched sequence is too small to accomodate the whole trim length
                contigs[cntgIndex][2][0] += matchedStart
                contigs[cntgIndex][2][2] = 1
                contigs[cntgIndex][2][3] -= startT #+ matchedStart
            else: #if the entire trim length can be taken out of the non-matching sequence
                contigs[cntgIndex][2][2] -= startT
                contigs[cntgIndex][2][3] -= startT                
            if matchedEnd > 0:
                contigs[cntgIndex][2][1] -= matchedEnd
                contigs[cntgIndex][2][3] -= matchedEnd
            del contigs[cntgIndex][1][:startT]
            if endT > 0:
                del contigs[cntgIndex][1][-endT:]
        return 0

    if trimNum > 0: #if -t modifier is supplied
        if verbose:
            print '-t Modifier: Contig ends are being trimed by ' + str(trimNum) + '...',
        delCount = 0
        for cntgIndex in range(-len(contigs) + 1,1):
            cntgIndex *= -1
            if trim(cntgIndex,trimNum,trimNum) == 1:
                delCount += 1
        if verbose:
            print 'Complete'
            if delCount == 1:
                print '    1 contig was removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
            elif delCount > 1:
                print '    ' + str(delCount) + ' contigs were removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
    if trimQual > 0: #if -v modifier is supplied
        delCount = 0
        if verbose:
            print '-v Modifier: Contig ends are being trimed until a base with a quality value of at least ' + str(trimQual) + ' is encountered...',
        for cntgIndex in range(-len(contigs) + 1,1):
            cntgIndex *= -1
            startTrim = 0
            endTrim = 0
            for baseIndex in range(0,len(contigs[cntgIndex][1])):
                if int(contigs[cntgIndex][1][baseIndex][-1]) < trimQual:
                    startTrim += 1
                else:
                    break
            for baseIndex in range(-len(contigs[cntgIndex][1]) + 1,1): #loops backwards
                baseIndex *= -1
                if int(contigs[cntgIndex][1][baseIndex][-1]) < trimQual:
                    endTrim += 1
                else:
                    break
            if trim(cntgIndex,startTrim,endTrim) == 1:
                delCount += 1
        if verbose:
            print 'Complete'
            if delCount == 1:
                print '    1 contig was removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
            elif delCount > 1:
                print '    ' + str(delCount) + ' contigs were removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
    #########################################################################################################################

    ###Non-matching contig end triming#######################################################################################
    if matchsOnly:
        if verbose:
            print '-m Modifier: Removing non-matching sequence...',
        if SQaligned:
            for cntgIndex in range(0,len(contigs)):
                del contigs[cntgIndex][1][contigs[cntgIndex][2][3]:]
                del contigs[cntgIndex][1][:contigs[cntgIndex][2][2] - 1]
                contigs[cntgIndex][2][3] -= contigs[cntgIndex][2][2] - 1
                contigs[cntgIndex][2][2] = 1
        else:
            debugLog.append('Error: the -m modifier must be used with an aligned quality file from sumqual.')
            errorExit() #exit the script
        if verbose:
            print 'Complete'
    #########################################################################################################################

    ###Contained contig extraction###########################################################################################
    contCntgSeqs = []
    if containedCngts == False and includeCntgs:
        if verbose:
            print 'Removing contianed contigs from the anaylsis...',
        contained = []
        contCntgs = []
        for cntgIndex in range(0,len(contigs)):
            for compIndex in range(0,len(contigs)):
                if contigs[compIndex][2][0] >= contigs[cntgIndex][2][0] and contigs[compIndex][2][1] <= contigs[cntgIndex][2][1] and compIndex != cntgIndex:
                    contained.append(compIndex)
        contained.reverse()
        for index in contained:
            contCntgs.append(contigs[index])
            del contigs[index]
        for cntg in contCntgs:
            contCntgSeqs.append('>' + cntg[0])
            outStr = ''
            for qual in cntg[1]:
                outStr += str(qual[0])
            contCntgSeqs.append(outStr)
        if verbose:
            print 'Complete'
            if len(contained) == 0:
                print '    No contained contigs found.' 
            elif len(contained) == 1:
                print '    1 contained contig found.' 
            else:
                print '    ' + str(len(contained)) + ' contained contigs found.'
    elif verbose:
        print '-a Modifier: Contained contigs will be included in the anaylsis.'
    #########################################################################################################################

    sortIndex = 0
    def sortComp(cntgA,cntgB):
        cntgAStart = cntgA[2][0] - cntgA[2][2]
        cntgBStart = cntgB[2][0] - cntgB[2][2]
        if cntgAStart > cntgBStart:
            return 1
        elif cntgAStart == cntgBStart:
            return 0
        else:
            return -1

    #########################################################################################################################
    #>Funtion used by the IUPAC function to remove repeated characters in list                                              #
    #>ex: ['A','C','C'] becomes ['A','C']                                                                                   #
    def removeRepeats(charList):
        for charIndex in range(0,len(charList)):
            for index in range(0,len(charList)):
                if charIndex != index and charList[charIndex] == charList[index]:
                    if charList[charIndex].islower(): 
                        del charList[charIndex]
                    else:
                        del charList[index]                        
                    return removeRepeats(charList)
        return charList
    #########################################################################################################################

    #########################################################################################################################
    #>Function that takes a list of base character values (ACTG-~N) and condenses them into a single IUPAC code             #
    #>Assumes only input characters to be ACTG-~N                                                                           #
    #>Ex: ['A','C'] becomes 'M'                                                                                             #
    def getIUPAC(baseValues):
        returnChar = ''
        x = removeRepeats(baseValues)
        if len(x) == 0:
            return '-'
        elif len(x) == 1:
            return x[0]
        for index in range(0,len(x)):
            if x[index] == '~':
                x[index] = '-'
        x.sort()
        for index in range(-len(x) + 1, 1):
            index *= -1
            if len(x) > 1 and x[index] == '-':
                del x[index]
        for index in range(-len(x) + 1, 1):
            index *= -1
            if len(x) > 1 and x[index].upper() == 'N':
                del x[index]
        if len(x) > 1:
            returnLower = True
            for char in x:
                if char.isupper():
                    returnLower = False
                    break
            if returnLower == False:
                for index in range(-len(x) + 1, 1):
                    index *= -1
                    if len(x) > 1 and x[index].islower():
                        del x[index]
            else:
                for index in range(0,len(x)):
                    x[index] = x[index].upper()
        if len(x) == 1:
            return x[0]
        elif len(x) == 2:
            if (x[0] == 'C' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'C'):
                returnChar = 'Y'
            elif (x[0] == 'A' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'A'):
                returnChar = 'R'
            elif (x[0] == 'A' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'A'):
                returnChar = 'W'
            elif (x[0] == 'C' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'C'):
                returnChar = 'S'
            elif (x[0] == 'G' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'G'):
                returnChar = 'K'
            elif (x[0] == 'C' and x[1] == 'A') or (x[0] == 'A' and x[1] == 'C'):
                returnChar = 'M'
        elif len(x) == 3:
            if x[0] != 'C' and x[1] != 'C' and x[2] != 'C':
                returnChar = 'D'
            elif x[0] != 'T' and x[1] != 'T' and x[2] != 'T':
                returnChar = 'V'
            elif x[0] != 'G' and x[1] != 'G' and x[2] != 'G':
                returnChar = 'H'
            elif x[0] != 'A' and x[1] != 'A' and x[2] != 'A':
                returnChar = 'B'
        else:
            returnChar = 'N'
        if returnLower:
            returnChar = returnChar.lower()
        return returnChar
    #########################################################################################################################
    SNPquals   = []
    alignedSeq = ''
    maskedSeq  = ''
    contigs.sort(sortComp)

    def getMasked(cntg,baseIndex,refIndex,minBaseVal,minSNPVal,minBaseProp,minSNPProp):
        char = cntg[1][baseIndex][0]
        qual = int(cntg[1][baseIndex][-1])
        if char != '-':
            ref = reference[refIndex].upper()
            if char.isupper() and char.upper() != ref.upper() and ref != '-': #is SNP
                if qual < minSNPVal:
                    char = maskingChar.upper()
                qual = [ref] + [refIndex + 1] + [cntg[0]] + [baseIndex + 1] + cntg[1][baseIndex]
                SNPquals.append(qual)
            elif minBaseVal > 0 and int(cntg[1][baseIndex][-1]) < minBaseVal:
                if cntg[1][baseIndex][0].isupper():
                    char = maskingChar.upper()
                else:
                    char = maskingChar.lower()
        return char

    ###IUPAC Alignment Generation############################################################################################
    if IUPAC:
        if SQaligned:
            if verbose:
                print '-i Modifier: Generating IUPAC-format consensous alignment...',
            for refIndex in range(0,len(reference)):
                maskedChars = []
                origChars = []
                for cntg in contigs:
                    start = cntg[2][0] - cntg[2][2]
                    end   = cntg[2][1] + (len(cntg[1]) - cntg[2][3]) - 1 
                    if refIndex >= start and refIndex <= end:
                        baseIndex = refIndex - start
                        maskedChars.append(getMasked(cntg,baseIndex,refIndex,minQualScore,minSNPScore,minBaseProp,minSNPProp))
                        origChars.append(cntg[1][baseIndex][0])
                maskedSeq += getIUPAC(maskedChars)
                alignedSeq += getIUPAC(origChars)
            outData.append(['Aligned_Contigs',alignedSeq])
            if minQualScore != 0 or minSNPScore != 0:
                outData.append(['Masked_Contigs',maskedSeq])
            if verbose:
                print 'Complete'
                
        else:
            debugLog.append('Error: the -i modifier must be used with an aligned quality file from sumqual.')
            errorExit() #exit the script
    #########################################################################################################################

    def contigsOverlap(index1,index2):
        cntg = contigs[index1]
        comp = contigs[index2]
        compStart = comp[2][0] - comp[2][2]
        cntgEnd   = cntg[2][1] + (len(cntg[1]) - cntg[2][3]) - 1
        out = cntgEnd - compStart + 1
        if out >= 0:
            return out
        else:
            return 0

    def matchsOverlap(index1,index2):
        compStart = contigs[index2][2][0]
        cntgEnd   = contigs[index1][2][1] 
        out = cntgEnd - compStart + 1
        if out >= 0:
            return out
        else:
            return 0

    ###Spaced alignment Generation###########################################################################################
    if IUPAC == False and SQaligned:
        if verbose:
            print 'Generating consensous alignment...',
        start = contigs[0][2][0] - contigs[0][2][2]
        if start > 1:
            for count in range(0,start):
                alignedSeq += '-'
                maskedSeq += '-'
        refDels = []
        for qualIndex in range(0,len(contigs[0][1])):
            alignedSeq += contigs[0][1][qualIndex][0]
            refIndex = contigs[0][2][0] - contigs[0][2][2] + qualIndex
            maskedSeq += getMasked(contigs[0],qualIndex,refIndex,minQualScore,minSNPScore)
        for cntgIndex in range(0,len(contigs) - 1):
            compIndex = cntgIndex + 1
            matchNum = matchsOverlap(cntgIndex,compIndex) #the amount of bases that the matching regions of the adjacent conigs overlap
            cntgNum = contigsOverlap(cntgIndex,compIndex)
            delNum = cntgNum  #= (contig length - end of ref match) + match overlap size 
            refInsert = int(contigs[cntgIndex][2][1]) #the index at which the deletion will be inserted
            for count in range(0,delNum):
                reference.insert(refInsert, '-')
            for cIndex in range(compIndex,len(contigs)):
                contigs[cIndex][2][0] += delNum #the contig with the larger starting match index must be moved up 
                contigs[cIndex][2][1] += delNum
            cntgEnd = contigs[cntgIndex][2][1] + (len(contigs[cntgIndex][1]) - contigs[cntgIndex][2][3]) - 1
            compStart = contigs[compIndex][2][0] - contigs[compIndex][2][2]
            if cntgEnd + 1 < compStart:
                count = compStart - cntgEnd - 1
                for x in range(0,count):
                    alignedSeq += '-'
                    maskedSeq += '-'
            for qualIndex in range(0,len(contigs[compIndex][1])):
                alignedSeq += contigs[compIndex][1][qualIndex][0]
                refIndex = contigs[compIndex][2][0] - contigs[compIndex][2][2] + qualIndex
                maskedSeq += getMasked(contigs[compIndex],qualIndex,refIndex,minQualScore,minSNPScore)
        outData.append(['Aligned_Contigs',alignedSeq])
        if minQualScore != 0 or minSNPScore != 0:
            outData.append(['Masked_Contigs',maskedSeq])
        if verbose:
            print 'Complete'

    #########################################################################################################################

    ###Contig Addition#######################################################################################################
    if includeCntgs:
        if verbose:
            print '-c Modifier: Extracting/Aligning Contigs...',
        for cntg in contigs:
            cntgSeq = ''
            cntgStart = cntg[2][0] - cntg[2][2] + 1
            if SQaligned and alignCntgs:
                for count in range(1,cntgStart):
                    cntgSeq += '-'
            qualIndex = 0
            for qual in cntg[1]:
                refIndex = cntg[2][0] - cntg[2][2] + qualIndex
                cntgSeq += getMasked(cntg,qualIndex,refIndex,cntgQualScore,cntgSNPScore)
                qualIndex += 1
            outData.append([cntg[0],cntgSeq])
        if verbose:
            print 'Complete'
    #########################################################################################################################
    refStr = ''
    for char in reference:
        refStr += char
    outData.insert(0,['Reference',refStr])

    ###Pointless spacer removal##############################################################################################
    if verbose:
        print 'Pointless index detection and removal...',
    delList = [] #will contain the indices at which all of the sequences have '-' and must be deleted 
    for refIndex in range(0,len(outData[0][1])): #loops through each index in terms of the reference 
        pointlessSpace = True #each indice is asumed to be a space 
        for cntgIndex in range(0,len(outData)): #loops through all the sequences that are being checked for spaces 
            seqLen = len(outData[cntgIndex][1]) 
            if refIndex < seqLen: #if the index being tested is present on this contig... 
                bp = outData[cntgIndex][1][refIndex]
                if bp != '-': #if the base is not a space...
                    pointlessSpace = False
        if pointlessSpace:
            delList.append(refIndex)
    if len(delList) > 0:
        for deletion in delList:
            for index in range(0,len(SNPquals)):
                if deletion < SNPquals[index][1]:
                    SNPquals[index][1] -= 1
        delList = mkRange(delList) #makes a list of indices into a list of index ranges
        delList.reverse() #reverses the order of the list to preserve the accuracy of the index values in delList as they are deleted from faData
        for theRange in delList: #for every range specifed by delList...
            for cntgIndex in range(0,len(outData)): #for every contig is faData...
                outData[cntgIndex][1] = outData[cntgIndex][1][:theRange[0]] + outData[cntgIndex][1][theRange[1] + 1:] #remove the section determined to be spaces
    if verbose:
        print 'Complete'
    #########################################################################################################################

    ###Out file writing and saveing procedures###############################################################################
    fileSavePath = savePath + os.path.basename(qualPath) + '.fsa' #the path to where the output is saved
    if verbose:
        print 'Generating output files...',
    outHandle = open(fileSavePath, 'w') #opens the file object for saving the output
    for seq in outData: #prints every line of outSeqs to the output file...
        outHandle.write('>' + seq[0] + '\n')
        outHandle.write(seq[1] + '\n')
    if printOut: #if the -p modifier is supplied...
        for seq in outData: #prints every line of outSeqs to the screen...
            print seq[0] 
            print seq[1]
    if len(contCntgSeqs) > 0:
        contCntgSavePath = savePath + os.path.basename(qualPath) + '_ContatiedContigs.fsa' #the path to where contained contigs are saved is saved
        CCHandle = open(contCntgSavePath, 'w')
        for line in contCntgSeqs:
            CCHandle.write(line + '\n')
        CCHandle.close()
    outHandle.close() #closes file object
    if SNPs:
        snpSavePath = savePath + os.path.basename(qualPath) + '_SNPs.qual' #the path to where SNPs are saved
        snpHandle = open(snpSavePath, 'w')
        for qual in SNPquals:
            line = ''
            for entry in qual:
                line += str(entry) + '\t'
            line = line[:-1]
            snpHandle.write(line + '\n')
        snpHandle.close()        
    if verbose:
        print 'Complete'
        print '    ' + str(len(outData)) + ' sequences saved to ' + fileSavePath + '.'
        if SNPs:
            print '    ' + str(len(SNPquals)) + ' SNPs saved to ' + snpSavePath + '.'
        if len(contCntgSeqs) > 0:
            if len(contained) == 1:
                print '    1 contained contig sequence saved to ' + contCntgSavePath + '.'
            elif len(contained) > 1:
                print '    ' + str(len(contained)) + ' contained contig sequences saved to ' + contCntgSavePath + '.'
    #########################################################################################################################

    ###Debug Saving procedures###############################################################################################
    if saveDebug: #if the -d modifier is supplied...
        debugHandle = open(savePath + os.path.basename(qualPath) + '_debug.txt', 'w')
        for line in debugLog:
            debugHandle.write(line)
        debugHandle.close()
    #########################################################################################################################
Qualtofa.py

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools