Qualtofa.py
From OpenWetWare
Copy and past the following into a text editor and save it as qualtofa.py to use the script.
#!/usr/bin/env python
#############################################################################################################################
# qualtofa [options] <YASRA/sumqual quality file path> #
#---------------------------------------------------------------------------------------------------------------------------#
# Example: python qualtofa.py -i -c q 20 r -q 5 -s 20 ../myValues.qual #
#---------------------------------------------------------------------------------------------------------------------------#
# Version: 2.3 Last Modified: 4/19/2010 Author: Zachary S. L. Foster #
# Intended uses: #
# 1)Produces alignments in FASTA format from sumquals output. #
# 2)Extract the sequences of multiple contigs from a YASRA quality file. #
#---------------------------------------------------------------------------------------------------------------------------#
# Modifiers: #
# -c : Include each contig on its own line... (EX: -c q 20 r) #
# q : Set masking value for contigs (Analogous to -q modifier below; reference NOT needed). #
# s* : Set SNP masking value for contigs (Analogous to -s modifier below). #
# r* : Do NOT align to the reference using '-'s for spacers. #
# -d : Save debug log to current working directory. #
# -i* : Condense all overlapping regions into IUPAC ambiguity codes... #
# Default: Add deletions to the reference to accommodate the overlapping sequence. #
# -m* : Only include matching regions from the quality file. #
# -p : Print output to screen. #
# -q* : Set minimum masking value (a new masked sequence will be added to the output file). #
# -s* : Set minimum masking value for SNPs. #
# -a : Include contained contigs (defalt: save sequences of contiained contigs to a seperate file) #
# -t : Trim the ends of the contigs by the specified number of bases. #
# -v : Trim all the bases on either end of all contigs that have a quality value less than the specified amount. #
# -b : Verbose output; print progress reports and statistics. #
# -l : Set minimum match length #
# -S : Silent; Do not print anything to the standard output (aka screen) #
# -n : save SNPs to a .qual file
# #
# *Note - these modifiers rely on the reference included in the output of sumqual. If your quality #
# file is a list of unaligned contigs (e.g. from YASRA), these modifiers cannot be used. #
#############################################################################################################################
###Imports / Variable Initilization##########################################################################################
import os, string, sys, time, copy
defArgList = ['qualtofa.py','-b','-c','s','1','-c','r','-i','-n','-s','100','C:/Python26/AGCT.filtered.delta.qual']
#defArgList = ['qualtofa.py','-i','-q','20','-s','100','-c','-v','15','-b','-l','100','C:/Python26/out.delta.qual']
#defArgList = ['qualtofa.py','-v','50','-c','q','20','s','100','C:/Python26/out.delta.qual']
#defArgList = ['qualtofa.py','-t','10','-c','q','20','s','100','C:/Python26/out.delta.qual']
argList = sys.argv #argumen list supplied by user
argNum = len(argList) #the number of arguments supplied
minArgNum = 1 #the smallest amount of arguments with which it is possible to run the script
saveDebug = False #is True if the debug is to be saved
warning = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug log
printOut = False #is True when no the -p modifier is supplied; #The results will be printed to the standard output (usually the screen)
helpOnly = False #is 'True' when no arguments are given; only help menu is printed
includeCntgs= False #is True when the -c modifier is used
IUPAC = False #is True when the -i modifier is used
matchsOnly = False #is True when the -m modifier is used
alignCntgs = True #is False when the r argument of the -c modifier is used
SQaligned = False #if the input file has the sumqual output format
verbose = False #is True when the -b modifier is used
SNPs = False
silent = False
minMatchLen = 50 #Minimum match length accepted (after contig end trimming by -t and -v modifiers)
trimNum = 0 #number of bases removed from the ends of all the contigs
trimQual = 0 #minimum quality value needed to not be removed from the ends of the contigs
containedCngts = False #is True when the -a modifier is used
maskingChar = 'N' #What represents a "unknown" base
minQualScore= 0 #determined by the presence of the -q modifier
cntgQualScore= 0 #determined by the presence of the -c modifier, with argument q
cntgSNPScore= 0 #determined by the presence of the -c modifier, with argument s
minSNPScore = 0 #determined by the presence of the -s modifier
savePath = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE
modifiers = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods = 'cdeimpqastvblSn' #all the modifiers recognized by the script
activeMods = '' #a string containing all modifiers specified by the user
modArgs = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
qualData = []
refPath = ''
refData = []
outData = [] #eventually holds the data that will be saved in the output file
debugLog = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied
#############################################################################################################################
#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error #
#>Prints the debug log to standard out (usually the screen) #
def errorExit():
print 'The program was forced to exit prematurely, printing debug log...\n'
for line in debugLog:
print line
sys.exit()
#############################################################################################################################
def printHelp(): #is called if no arguments are supplied
print '/--------------------------------------------------------------------------------------------------\\'
print '| qualtofa [options] <YASRA/sumqual quality file path> |'
print '|--------------------------------------------------------------------------------------------------|'
print '| Example: python qualtofa.py -i -c q 20 r -q 5 -s 20 ../myValues.qual |'
print '|--------------------------------------------------------------------------------------------------|'
print '| verboseOut: 2.4 Last Modified: 6/24/2010 Author: Zachary S. L. Foster |'
print '| Intended uses: |'
print '| 1)Produces alignments in FASTA format from sumquals output. |'
print '| 2)Extract the sequences of multiple contigs from a YASRA quality file. |'
print '|--------------------------------------------------------------------------------------------------|'
print '| Modifiers: |'
print '| -c : Include each contig on its own line... (EX: -c q 20 r) |'
print '| q : Set masking value for contigs (Analogous to -q modifier below; reference NOT needed).|'
print '| s* : Set SNP masking value for contigs (Analogous to -s modifier below). |'
print '| r* : Align to the reference using dashs for spacers. |'
print '| -d : Save debug log to current working directory. |'
print '| -i* : Condense all overlapping regions into IUPAC ambiguity codes... |'
print '| Default: Add deletions to the reference to accommodate the overlapping sequence. |'
print '| -m* : Only include matching regions from the quality file. |'
print '| -p : Print output to screen. |'
print '| -q* : Set minimum masking value (a new masked sequence will be added to the output file). |'
print '| -s* : Set minimum masking value for SNPs. |'
print '| -a : Include contained contigs Defalt: save sequences of contiained contigs to a seperate file |'
print '| -t : Trim the ends of the contigs by the specified number of bases. |'
print '| -v : Trim all the bases on either end of all contigs that have a quality value less than |'
print '| the specified amount. |'
print '| -b : Verbose output; print progress reports and statistics. |'
print '| -l : Set minimum match length |'
print '| -S : Silent; Do not print anything to the standard output (aka screen) |'
print '| |'
print '| *Note - these modifiers rely on the reference included in the output of sumqual. If your quality |'
print '| file is a list of unaligned contigs (e.g. from YASRA), these modifiers cannot be used. |'
print '\\--------------------------------------------------------------------------------------------------/'
#############################################################################################################################
#>function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10) #
def mkRange(numList):
outList = []
start = numList[0]
current = numList[0]
numList.append(numList[-1])
for num in numList[1:]:
if (num - 1) != current:
outList.append([start,current])
start = num
current = num
return outList
#############################################################################################################################
###Argument Interprtation####################################################################################################
#>Parses the arguments supplied by the user, or the default values if the script is being run on IDLE #
#>If no arguments are given, the help menu is printed #
#>Modifiers and their arguments are isolated from the raw input for later processing (in Modifier Interpretation) #
#if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
if os.path.exists(argList[-1]) == 0: #if the path dose not exist
debugLog.append('Error: Invalid file path to input data')
errorExit() #end the program
elif argNum == 1: #If no arguments are supplied...
helpOnly = True
printHelp() #prints help menu
else:
debugLog.append('Error: Too few arguments supplied\n')
errorExit()
#else: #If the script is being imported on to IDLE
# argList = defArgList #use default arguments
# argNum = len(defArgList)
# if argNum == 1: #If no arguments are supplied...
# helpOnly = True
# printHelp() #prints help menu
# savePath = defSavePath #sets the save path to the default, specified in the variable initialization section
# debugLog.append('Alert: default debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
qualPath = argList[-1] #the path to the qual file containing the input data
if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their arguments
#############################################################################################################################
if helpOnly == False:
###Modifier Interpretation###############################################################################################
#>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation" #
#>Given arguments are compared against a list of known arguments #
#>Matches found change the appropriate variable for the desired effect of the modifier the script #
if len(modifiers) > 0: #if modifiers are supplied
for mod in modifiers: #loops through the list of modifiers and modifier arguments
if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters
activeMods += mod[1:] #sorts the modifiers into activeMods...
else:
modArgs.append(mod) #assumes everything else to be a modifier argument
for letter in activeMods: #checks if the modifiers are recognized
if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script
debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
warning = True #if the input modifier is not found
else:
if letter == 'c': #if -c is supplied...
if len(modArgs) > 0 and modArgs[0] == 'q': #if there is at least two non-processed modifier arguments
if len(modArgs) > 1:
cntgQualScore = int(modArgs[1])
del modArgs[:2] #the modifier arguments are deleted from the list
else:
debugLog.append('Error: Modifier argument not supplied\n')
errorExit() #exit the script
if len(modArgs) > 0 and modArgs[0] == 's': #if there is at least two non-processed modifier arguments
if len(modArgs) > 1:
cntgSNPScore = int(modArgs[1])
del modArgs[:2] #the modifier arguments are deleted from the list
else:
debugLog.append('Error: Modifier argument not supplied\n')
errorExit() #exit the script
if len(modArgs) > 0 and modArgs[0] == 'r': #if there is at least one non-processed modifier argument
alignCntgs = False
del modArgs[0] #the modifier argument is deleted from the list
includeCntgs = True #include each contig on its own line
elif letter == 'd': #if -d is supplied...
saveDebug = True #The debug log will be saved to the current working directory
elif letter == 'a': #if -a is supplied...
containedCngts = True
elif letter == 'e': #if -e is supplied...
saveEditLog = True #save log of edits made to the contig
elif letter == 'i': #if -i is supplied...
IUPAC = True #condense all overlapping regions into IUPAC ambiguity codes
elif letter == 'm': #if -m is supplied...
matchsOnly = True #only include matching regions of from the qual file
elif letter == 'p': #if -p is supplied...
printOut = True #The results will be printed to the standard output (usually the screen)
elif letter == 'b': #if -b is supplied...
verbose = True #Verbose output
elif letter == 'S': #if -S is supplied...
silent = True #Verbose output
elif letter == 'n': #if -n is supplied...
SNPs = True
elif letter == 'q': #if -q is supplied...
if len(modArgs) > 0: #if there is at least one non-processed modifier argument
minQualScore = int(modArgs[0])
del modArgs[0] #the original argument is deleted from the list
else: #if the list of modifier arguments is empty...
print 'Error: Modifier argument not supplied\n'
sys.exit() #exit the script
elif letter == 's': #if -s is supplied...
if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
minSNPScore = int(modArgs[0]) #The string specifying the desired lines to process from the input file
del modArgs[0] #the original argument is deleted from the list
else: #if the list of modifier arguments is empty...
print 'Error: Modifier argument not supplied\n'
sys.exit() #exit the script
elif letter == 't': #if -t is supplied...
if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
trimNum = int(modArgs[0]) #The string specifying the desired lines to process from the input file
del modArgs[0] #the original argument is deleted from the list
else: #if the list of modifier arguments is empty...
print 'Error: Modifier argument not supplied\n'
sys.exit() #exit the script
elif letter == 'v': #if -v is supplied...
if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
trimQual = int(modArgs[0]) #The string specifying the desired lines to process from the input file
del modArgs[0] #the original argument is deleted from the list
else: #if the list of modifier arguments is empty...
print 'Error: Modifier argument not supplied\n'
sys.exit() #exit the script
elif letter == 'l': #if -l is supplied...
if len(modArgs) > 0: #if there are at least two non-processed modifier arguments
minMatchLen = int(modArgs[0]) #The string specifying the desired lines to process from the input file
del modArgs[0] #the original argument is deleted from the list
else: #if the list of modifier arguments is empty...
print 'Error: Modifier argument not supplied\n'
sys.exit() #exit the script
if verbose:
print 'qualtofa: Version: 2.4 Last Modified: 6/24/2010 by Zachary S. L. Foster'
#########################################################################################################################
if verbose:
print 'Parsing input file ' + qualPath + '.',
###Quality Value File Parsing############################################################################################
qualHandle = open(qualPath, 'r') #opens the file containing the quality values
qualRaw = qualHandle.readlines() #saves all of the file into qualRaw
qualHandle.close() #closes the file object
qualName = qualRaw[2][:-1] #the name of the first contig is the 3rd line minus the newline
qualVals = [] #will contain a list of quality values for each contig
qualData = [] #the data structure holding the parsed data
qualRaw.append('randomline') #place mark needed to process the last contig
for line in qualRaw[3:]: #for every line past the first 4...
splitLine = string.split(line) #make a list of all the parts of line separated by whitespace
if len(splitLine) == 1 and splitLine[0].isdigit() == False: #if there is only one "word" (i.e. it is a name of a contig)
qualData.append([qualName,qualVals]) #add the previous contig to the parsed data
qualName = splitLine[0] #new contig name
qualVals = []
else: #if the line is a set of quality values...
qualVals.append(splitLine) #append them to the list of quality values for this contig
#########################################################################################################################
if verbose:
print '.',
###Reference Extraction##################################################################################################
schema = string.split(qualRaw[1])
if len(schema) > 0 and schema[1] == 'Reference':
SQaligned = True
reference = []
for qual in qualData[0][1]:
reference.append(qual[1])
#########################################################################################################################
if verbose:
print '.',
###Contig Extraction####################################################################################################
contigs = [] #will hold a list of contig names and their sequences (ex: [['name1',['A','C','G']],['name2',['G','C','A']],...])
def addBase(cntgName,baseQuals):
for cntgIndex in range(0,len(contigs)):
if contigs[cntgIndex][0] == cntgName:
contigs[cntgIndex][1].append(baseQuals)
return
if SQaligned:
contigs.append([cntgName,[baseQuals],refIndex])
else:
contigs.append([cntgName,[baseQuals]])
if SQaligned:
qualNum = 9
refColNum = 3
else:
qualNum = 7
refColNum = 0
for cntgIndex in range(0,len(qualData)): #for every contig...
for baseIndex in range(0,len(qualData[cntgIndex][1])): #for every base of every contig
baseDepth = (len(qualData[cntgIndex][1][baseIndex]) - refColNum + 1) / qualNum #the number of distinct bases (quality value data) per index
for depth in range(0,baseDepth): #for every quality value for every base in every contig
startIndex = (depth * qualNum) + refColNum + 1 #the index of the character value for that quality value
endIndex = startIndex + 5
if SQaligned:
name = qualData[cntgIndex][1][baseIndex][startIndex - 2]
refIndex = qualData[cntgIndex][1][baseIndex][0]
else:
name = qualData[cntgIndex][0]
quals = qualData[cntgIndex][1][baseIndex][startIndex:endIndex + 2]
addBase(name,quals) #add the character value to the list
#########################################################################################################################
###Conitg Match Index addition#########################################################################################
if SQaligned:
rejects = []
for cntgIndex in range(0,len(contigs)):
inMatch = False
cntgEnd = 0
cntgStart = 0
for baseIndex in range(0,len(contigs[cntgIndex][1])):
if contigs[cntgIndex][1][baseIndex][0].isupper() and inMatch == False:
cntgStart = baseIndex + 1
inMatch = True
elif contigs[cntgIndex][1][baseIndex][0].islower() and inMatch == True:
cntgEnd = baseIndex
break
if cntgEnd == 0:
cntgEnd = len(contigs[cntgIndex][1])
refStart = int(contigs[cntgIndex][2]) + cntgStart - 1
refEnd = int(contigs[cntgIndex][2]) + cntgEnd - 1
del contigs[cntgIndex][2]
if refEnd - refStart < minMatchLen:
rejects.append(cntgIndex)
else:
contigs[cntgIndex].append([refStart,refEnd,cntgStart,cntgEnd])
rejects.reverse()
for index in rejects:
del contigs[index]
if verbose:
print 'Complete'
print ' ' + str(len(contigs)) + 'contigs will be included in the anaylsis.'
if len(rejects) == 1:
print ' 1 contig was not included due to insufficient sequence length ( < ' + str(minMatchLen) + ' ).'
elif len(rejects) > 1:
print ' ' + str(len(rejects)) + ' contigs were not included due to insufficient sequence length ( < ' + str(minMatchLen) + ' ).'
#########################################################################################################################
###Contig end trimming###################################################################################################
def trim(cntgIndex,startT,endT):
if startT + endT + minMatchLen >= (contigs[cntgIndex][2][1] - contigs[cntgIndex][2][0] + 1):
del contigs[cntgIndex]
return 1
else:
unMatchedStart = contigs[cntgIndex][2][2] - 1 #the number of bases before the match
unMatchedEnd = len(contigs[cntgIndex][1]) - contigs[cntgIndex][2][3] #the number of bases after the match
matchedStart = startT - unMatchedStart
matchedEnd = endT - unMatchedEnd
if matchedStart > 0: #if the unmatched sequence is too small to accomodate the whole trim length
contigs[cntgIndex][2][0] += matchedStart
contigs[cntgIndex][2][2] = 1
contigs[cntgIndex][2][3] -= startT #+ matchedStart
else: #if the entire trim length can be taken out of the non-matching sequence
contigs[cntgIndex][2][2] -= startT
contigs[cntgIndex][2][3] -= startT
if matchedEnd > 0:
contigs[cntgIndex][2][1] -= matchedEnd
contigs[cntgIndex][2][3] -= matchedEnd
del contigs[cntgIndex][1][:startT]
if endT > 0:
del contigs[cntgIndex][1][-endT:]
return 0
if trimNum > 0: #if -t modifier is supplied
if verbose:
print '-t Modifier: Contig ends are being trimed by ' + str(trimNum) + '...',
delCount = 0
for cntgIndex in range(-len(contigs) + 1,1):
cntgIndex *= -1
if trim(cntgIndex,trimNum,trimNum) == 1:
delCount += 1
if verbose:
print 'Complete'
if delCount == 1:
print ' 1 contig was removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
elif delCount > 1:
print ' ' + str(delCount) + ' contigs were removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
if trimQual > 0: #if -v modifier is supplied
delCount = 0
if verbose:
print '-v Modifier: Contig ends are being trimed until a base with a quality value of at least ' + str(trimQual) + ' is encountered...',
for cntgIndex in range(-len(contigs) + 1,1):
cntgIndex *= -1
startTrim = 0
endTrim = 0
for baseIndex in range(0,len(contigs[cntgIndex][1])):
if int(contigs[cntgIndex][1][baseIndex][-1]) < trimQual:
startTrim += 1
else:
break
for baseIndex in range(-len(contigs[cntgIndex][1]) + 1,1): #loops backwards
baseIndex *= -1
if int(contigs[cntgIndex][1][baseIndex][-1]) < trimQual:
endTrim += 1
else:
break
if trim(cntgIndex,startTrim,endTrim) == 1:
delCount += 1
if verbose:
print 'Complete'
if delCount == 1:
print ' 1 contig was removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
elif delCount > 1:
print ' ' + str(delCount) + ' contigs were removed due to insufficient sequence length ( < ' + str(minMatchLen) + ' ) after trimming.'
#########################################################################################################################
###Non-matching contig end triming#######################################################################################
if matchsOnly:
if verbose:
print '-m Modifier: Removing non-matching sequence...',
if SQaligned:
for cntgIndex in range(0,len(contigs)):
del contigs[cntgIndex][1][contigs[cntgIndex][2][3]:]
del contigs[cntgIndex][1][:contigs[cntgIndex][2][2] - 1]
contigs[cntgIndex][2][3] -= contigs[cntgIndex][2][2] - 1
contigs[cntgIndex][2][2] = 1
else:
debugLog.append('Error: the -m modifier must be used with an aligned quality file from sumqual.')
errorExit() #exit the script
if verbose:
print 'Complete'
#########################################################################################################################
###Contained contig extraction###########################################################################################
contCntgSeqs = []
if containedCngts == False and includeCntgs:
if verbose:
print 'Removing contianed contigs from the anaylsis...',
contained = []
contCntgs = []
for cntgIndex in range(0,len(contigs)):
for compIndex in range(0,len(contigs)):
if contigs[compIndex][2][0] >= contigs[cntgIndex][2][0] and contigs[compIndex][2][1] <= contigs[cntgIndex][2][1] and compIndex != cntgIndex:
contained.append(compIndex)
contained.reverse()
for index in contained:
contCntgs.append(contigs[index])
del contigs[index]
for cntg in contCntgs:
contCntgSeqs.append('>' + cntg[0])
outStr = ''
for qual in cntg[1]:
outStr += str(qual[0])
contCntgSeqs.append(outStr)
if verbose:
print 'Complete'
if len(contained) == 0:
print ' No contained contigs found.'
elif len(contained) == 1:
print ' 1 contained contig found.'
else:
print ' ' + str(len(contained)) + ' contained contigs found.'
elif verbose:
print '-a Modifier: Contained contigs will be included in the anaylsis.'
#########################################################################################################################
sortIndex = 0
def sortComp(cntgA,cntgB):
cntgAStart = cntgA[2][0] - cntgA[2][2]
cntgBStart = cntgB[2][0] - cntgB[2][2]
if cntgAStart > cntgBStart:
return 1
elif cntgAStart == cntgBStart:
return 0
else:
return -1
#########################################################################################################################
#>Funtion used by the IUPAC function to remove repeated characters in list #
#>ex: ['A','C','C'] becomes ['A','C'] #
def removeRepeats(charList):
for charIndex in range(0,len(charList)):
for index in range(0,len(charList)):
if charIndex != index and charList[charIndex] == charList[index]:
if charList[charIndex].islower():
del charList[charIndex]
else:
del charList[index]
return removeRepeats(charList)
return charList
#########################################################################################################################
#########################################################################################################################
#>Function that takes a list of base character values (ACTG-~N) and condenses them into a single IUPAC code #
#>Assumes only input characters to be ACTG-~N #
#>Ex: ['A','C'] becomes 'M' #
def getIUPAC(baseValues):
returnChar = ''
x = removeRepeats(baseValues)
if len(x) == 0:
return '-'
elif len(x) == 1:
return x[0]
for index in range(0,len(x)):
if x[index] == '~':
x[index] = '-'
x.sort()
for index in range(-len(x) + 1, 1):
index *= -1
if len(x) > 1 and x[index] == '-':
del x[index]
for index in range(-len(x) + 1, 1):
index *= -1
if len(x) > 1 and x[index].upper() == 'N':
del x[index]
if len(x) > 1:
returnLower = True
for char in x:
if char.isupper():
returnLower = False
break
if returnLower == False:
for index in range(-len(x) + 1, 1):
index *= -1
if len(x) > 1 and x[index].islower():
del x[index]
else:
for index in range(0,len(x)):
x[index] = x[index].upper()
if len(x) == 1:
return x[0]
elif len(x) == 2:
if (x[0] == 'C' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'C'):
returnChar = 'Y'
elif (x[0] == 'A' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'A'):
returnChar = 'R'
elif (x[0] == 'A' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'A'):
returnChar = 'W'
elif (x[0] == 'C' and x[1] == 'G') or (x[0] == 'G' and x[1] == 'C'):
returnChar = 'S'
elif (x[0] == 'G' and x[1] == 'T') or (x[0] == 'T' and x[1] == 'G'):
returnChar = 'K'
elif (x[0] == 'C' and x[1] == 'A') or (x[0] == 'A' and x[1] == 'C'):
returnChar = 'M'
elif len(x) == 3:
if x[0] != 'C' and x[1] != 'C' and x[2] != 'C':
returnChar = 'D'
elif x[0] != 'T' and x[1] != 'T' and x[2] != 'T':
returnChar = 'V'
elif x[0] != 'G' and x[1] != 'G' and x[2] != 'G':
returnChar = 'H'
elif x[0] != 'A' and x[1] != 'A' and x[2] != 'A':
returnChar = 'B'
else:
returnChar = 'N'
if returnLower:
returnChar = returnChar.lower()
return returnChar
#########################################################################################################################
SNPquals = []
alignedSeq = ''
maskedSeq = ''
contigs.sort(sortComp)
def getMasked(cntg,baseIndex,refIndex,minBaseVal,minSNPVal,minBaseProp,minSNPProp):
char = cntg[1][baseIndex][0]
qual = int(cntg[1][baseIndex][-1])
if char != '-':
ref = reference[refIndex].upper()
if char.isupper() and char.upper() != ref.upper() and ref != '-': #is SNP
if qual < minSNPVal:
char = maskingChar.upper()
qual = [ref] + [refIndex + 1] + [cntg[0]] + [baseIndex + 1] + cntg[1][baseIndex]
SNPquals.append(qual)
elif minBaseVal > 0 and int(cntg[1][baseIndex][-1]) < minBaseVal:
if cntg[1][baseIndex][0].isupper():
char = maskingChar.upper()
else:
char = maskingChar.lower()
return char
###IUPAC Alignment Generation############################################################################################
if IUPAC:
if SQaligned:
if verbose:
print '-i Modifier: Generating IUPAC-format consensous alignment...',
for refIndex in range(0,len(reference)):
maskedChars = []
origChars = []
for cntg in contigs:
start = cntg[2][0] - cntg[2][2]
end = cntg[2][1] + (len(cntg[1]) - cntg[2][3]) - 1
if refIndex >= start and refIndex <= end:
baseIndex = refIndex - start
maskedChars.append(getMasked(cntg,baseIndex,refIndex,minQualScore,minSNPScore,minBaseProp,minSNPProp))
origChars.append(cntg[1][baseIndex][0])
maskedSeq += getIUPAC(maskedChars)
alignedSeq += getIUPAC(origChars)
outData.append(['Aligned_Contigs',alignedSeq])
if minQualScore != 0 or minSNPScore != 0:
outData.append(['Masked_Contigs',maskedSeq])
if verbose:
print 'Complete'
else:
debugLog.append('Error: the -i modifier must be used with an aligned quality file from sumqual.')
errorExit() #exit the script
#########################################################################################################################
def contigsOverlap(index1,index2):
cntg = contigs[index1]
comp = contigs[index2]
compStart = comp[2][0] - comp[2][2]
cntgEnd = cntg[2][1] + (len(cntg[1]) - cntg[2][3]) - 1
out = cntgEnd - compStart + 1
if out >= 0:
return out
else:
return 0
def matchsOverlap(index1,index2):
compStart = contigs[index2][2][0]
cntgEnd = contigs[index1][2][1]
out = cntgEnd - compStart + 1
if out >= 0:
return out
else:
return 0
###Spaced alignment Generation###########################################################################################
if IUPAC == False and SQaligned:
if verbose:
print 'Generating consensous alignment...',
start = contigs[0][2][0] - contigs[0][2][2]
if start > 1:
for count in range(0,start):
alignedSeq += '-'
maskedSeq += '-'
refDels = []
for qualIndex in range(0,len(contigs[0][1])):
alignedSeq += contigs[0][1][qualIndex][0]
refIndex = contigs[0][2][0] - contigs[0][2][2] + qualIndex
maskedSeq += getMasked(contigs[0],qualIndex,refIndex,minQualScore,minSNPScore)
for cntgIndex in range(0,len(contigs) - 1):
compIndex = cntgIndex + 1
matchNum = matchsOverlap(cntgIndex,compIndex) #the amount of bases that the matching regions of the adjacent conigs overlap
cntgNum = contigsOverlap(cntgIndex,compIndex)
delNum = cntgNum #= (contig length - end of ref match) + match overlap size
refInsert = int(contigs[cntgIndex][2][1]) #the index at which the deletion will be inserted
for count in range(0,delNum):
reference.insert(refInsert, '-')
for cIndex in range(compIndex,len(contigs)):
contigs[cIndex][2][0] += delNum #the contig with the larger starting match index must be moved up
contigs[cIndex][2][1] += delNum
cntgEnd = contigs[cntgIndex][2][1] + (len(contigs[cntgIndex][1]) - contigs[cntgIndex][2][3]) - 1
compStart = contigs[compIndex][2][0] - contigs[compIndex][2][2]
if cntgEnd + 1 < compStart:
count = compStart - cntgEnd - 1
for x in range(0,count):
alignedSeq += '-'
maskedSeq += '-'
for qualIndex in range(0,len(contigs[compIndex][1])):
alignedSeq += contigs[compIndex][1][qualIndex][0]
refIndex = contigs[compIndex][2][0] - contigs[compIndex][2][2] + qualIndex
maskedSeq += getMasked(contigs[compIndex],qualIndex,refIndex,minQualScore,minSNPScore)
outData.append(['Aligned_Contigs',alignedSeq])
if minQualScore != 0 or minSNPScore != 0:
outData.append(['Masked_Contigs',maskedSeq])
if verbose:
print 'Complete'
#########################################################################################################################
###Contig Addition#######################################################################################################
if includeCntgs:
if verbose:
print '-c Modifier: Extracting/Aligning Contigs...',
for cntg in contigs:
cntgSeq = ''
cntgStart = cntg[2][0] - cntg[2][2] + 1
if SQaligned and alignCntgs:
for count in range(1,cntgStart):
cntgSeq += '-'
qualIndex = 0
for qual in cntg[1]:
refIndex = cntg[2][0] - cntg[2][2] + qualIndex
cntgSeq += getMasked(cntg,qualIndex,refIndex,cntgQualScore,cntgSNPScore)
qualIndex += 1
outData.append([cntg[0],cntgSeq])
if verbose:
print 'Complete'
#########################################################################################################################
refStr = ''
for char in reference:
refStr += char
outData.insert(0,['Reference',refStr])
###Pointless spacer removal##############################################################################################
if verbose:
print 'Pointless index detection and removal...',
delList = [] #will contain the indices at which all of the sequences have '-' and must be deleted
for refIndex in range(0,len(outData[0][1])): #loops through each index in terms of the reference
pointlessSpace = True #each indice is asumed to be a space
for cntgIndex in range(0,len(outData)): #loops through all the sequences that are being checked for spaces
seqLen = len(outData[cntgIndex][1])
if refIndex < seqLen: #if the index being tested is present on this contig...
bp = outData[cntgIndex][1][refIndex]
if bp != '-': #if the base is not a space...
pointlessSpace = False
if pointlessSpace:
delList.append(refIndex)
if len(delList) > 0:
for deletion in delList:
for index in range(0,len(SNPquals)):
if deletion < SNPquals[index][1]:
SNPquals[index][1] -= 1
delList = mkRange(delList) #makes a list of indices into a list of index ranges
delList.reverse() #reverses the order of the list to preserve the accuracy of the index values in delList as they are deleted from faData
for theRange in delList: #for every range specifed by delList...
for cntgIndex in range(0,len(outData)): #for every contig is faData...
outData[cntgIndex][1] = outData[cntgIndex][1][:theRange[0]] + outData[cntgIndex][1][theRange[1] + 1:] #remove the section determined to be spaces
if verbose:
print 'Complete'
#########################################################################################################################
###Out file writing and saveing procedures###############################################################################
fileSavePath = savePath + os.path.basename(qualPath) + '.fsa' #the path to where the output is saved
if verbose:
print 'Generating output files...',
outHandle = open(fileSavePath, 'w') #opens the file object for saving the output
for seq in outData: #prints every line of outSeqs to the output file...
outHandle.write('>' + seq[0] + '\n')
outHandle.write(seq[1] + '\n')
if printOut: #if the -p modifier is supplied...
for seq in outData: #prints every line of outSeqs to the screen...
print seq[0]
print seq[1]
if len(contCntgSeqs) > 0:
contCntgSavePath = savePath + os.path.basename(qualPath) + '_ContatiedContigs.fsa' #the path to where contained contigs are saved is saved
CCHandle = open(contCntgSavePath, 'w')
for line in contCntgSeqs:
CCHandle.write(line + '\n')
CCHandle.close()
outHandle.close() #closes file object
if SNPs:
snpSavePath = savePath + os.path.basename(qualPath) + '_SNPs.qual' #the path to where SNPs are saved
snpHandle = open(snpSavePath, 'w')
for qual in SNPquals:
line = ''
for entry in qual:
line += str(entry) + '\t'
line = line[:-1]
snpHandle.write(line + '\n')
snpHandle.close()
if verbose:
print 'Complete'
print ' ' + str(len(outData)) + ' sequences saved to ' + fileSavePath + '.'
if SNPs:
print ' ' + str(len(SNPquals)) + ' SNPs saved to ' + snpSavePath + '.'
if len(contCntgSeqs) > 0:
if len(contained) == 1:
print ' 1 contained contig sequence saved to ' + contCntgSavePath + '.'
elif len(contained) > 1:
print ' ' + str(len(contained)) + ' contained contig sequences saved to ' + contCntgSavePath + '.'
#########################################################################################################################
###Debug Saving procedures###############################################################################################
if saveDebug: #if the -d modifier is supplied...
debugHandle = open(savePath + os.path.basename(qualPath) + '_debug.txt', 'w')
for line in debugLog:
debugHandle.write(line)
debugHandle.close()
#########################################################################################################################