User:Alex S. Nord/Notebook/Computational Work/2011/10/19: Difference between revisions
Alex S. Nord (talk | contribs) (Autocreate 2011/10/19 Entry for User:Alex_S._Nord/Notebook/Computational_Work) |
Alex S. Nord (talk | contribs) |
||
Line 6: | Line 6: | ||
| colspan="2"| | | colspan="2"| | ||
<!-- ##### DO NOT edit above this line unless you know what you are doing. ##### --> | <!-- ##### DO NOT edit above this line unless you know what you are doing. ##### --> | ||
== | ==Mapping alignments to barcodes using command line BLAST== | ||
BLAST command line manual: | |||
http://www.ncbi.nlm.nih.gov/books/NBK1763/ | |||
BLAST command line dmg for Mac: | |||
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.2.25+.dmg | |||
UNIX command to convert sam to fasta using awk | |||
awk '{OFS="\t"; print ">"$1"\n"$10}' filename.sam > filename.fa | |||
command line call to generate indexed reference files for mapping to: | |||
makeblastdb -in reference.fa -dbtype nucl -parse_seqids -out reference.fa -title "Reference_Title" | |||
command line blastn call: | |||
blastn -task 'blastn-short' -query query.fa -db reference_index.fa -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq' -max_target_seqs 1 -reward 2 -gapopen 4 > output.txt | |||
R script for summarizing data: | |||
# set filenames for inout and output files | |||
blast.output.filename <- "~/Desktop/DD/m110923_025602_00123_c100181102555600000315046911131143_s2_p0.bas.h5_mapped_output.txt" | |||
output.filename <- "~/Desktop/DD/alignment_summary.txt" | |||
# read in blast alingment in tab delimited format | |||
the.data <- read.table(blast.output.filename, as.is=T, header=T) | |||
# generate summary count table for sseqid column | |||
count.table <- summary(as.factor(the.data[,2])) | |||
count.table <- cbind(BarcodeName=names(count.table), Count=as.numeric(count.table)) | |||
# initiate variables for basic information about sseqid matches | |||
med.length <- rep(NA, nrow(count.table)) | |||
max.length <- rep(NA, nrow(count.table)) | |||
percent.20 <- rep(NA, nrow(count.table)) | |||
barcode <- rep(NA, nrow(count.table)) | |||
# for each sseqid, generate summary information | |||
for (index in 1:nrow(count.table)) { | |||
temp.data <- which(the.data[,2]==count.table[index,1]) | |||
med.length[index] <- median(as.numeric(the.data[temp.data,"length"], na.rm=T)) | |||
max.length[index] <- max(as.numeric(the.data[temp.data,"length"]), na.rm=T) | |||
percent.20[index] <- length(which(as.numeric(the.data[temp.data,"length"])>=20))/length(temp.data) | |||
barcode[index] <- the.data[temp.data[(which(as.numeric(the.data[temp.data,"length"])==max.length[index]))[1]],"qseq"] | |||
} | |||
# generate final table and write output | |||
count.table <- cbind(count.table, med.length, max.length, percent.20, barcode) | |||
write.table(count.table, output.filename, sep="\t", row.names=F, col.names=T) | |||
<!-- ##### DO NOT edit below this line unless you know what you are doing. ##### --> | <!-- ##### DO NOT edit below this line unless you know what you are doing. ##### --> |
Revision as of 12:30, 19 October 2011
Project name | <html><img src="/images/9/94/Report.png" border="0" /></html> Main project page |
Mapping alignments to barcodes using command line BLASTBLAST command line manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/ BLAST command line dmg for Mac: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.2.25+.dmg UNIX command to convert sam to fasta using awk awk '{OFS="\t"; print ">"$1"\n"$10}' filename.sam > filename.fa command line call to generate indexed reference files for mapping to: makeblastdb -in reference.fa -dbtype nucl -parse_seqids -out reference.fa -title "Reference_Title" command line blastn call: blastn -task 'blastn-short' -query query.fa -db reference_index.fa -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq' -max_target_seqs 1 -reward 2 -gapopen 4 > output.txt R script for summarizing data:
blast.output.filename <- "~/Desktop/DD/m110923_025602_00123_c100181102555600000315046911131143_s2_p0.bas.h5_mapped_output.txt" output.filename <- "~/Desktop/DD/alignment_summary.txt"
the.data <- read.table(blast.output.filename, as.is=T, header=T)
count.table <- summary(as.factor(the.data[,2])) count.table <- cbind(BarcodeName=names(count.table), Count=as.numeric(count.table))
med.length <- rep(NA, nrow(count.table)) max.length <- rep(NA, nrow(count.table)) percent.20 <- rep(NA, nrow(count.table)) barcode <- rep(NA, nrow(count.table))
for (index in 1:nrow(count.table)) { temp.data <- which(the.data[,2]==count.table[index,1]) med.length[index] <- median(as.numeric(the.data[temp.data,"length"], na.rm=T)) max.length[index] <- max(as.numeric(the.data[temp.data,"length"]), na.rm=T) percent.20[index] <- length(which(as.numeric(the.data[temp.data,"length"])>=20))/length(temp.data) barcode[index] <- the.data[temp.data[(which(as.numeric(the.data[temp.data,"length"])==max.length[index]))[1]],"qseq"] }
count.table <- cbind(count.table, med.length, max.length, percent.20, barcode) write.table(count.table, output.filename, sep="\t", row.names=F, col.names=T) |