User:Alex S. Nord/Notebook/Computational Work/2011/10/19: Difference between revisions

Revision as of 12:30, 19 October 2011

Project name

<html><img src="/images/9/94/Report.png" border="0" /></html> Main project page

Mapping alignments to barcodes using command line BLAST

BLAST command line manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/

BLAST command line dmg for Mac: ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.2.25+.dmg

UNIX command to convert sam to fasta using awk awk '{OFS="\t"; print ">"$1"\n"$10}' filename.sam > filename.fa

command line call to generate indexed reference files for mapping to: makeblastdb -in reference.fa -dbtype nucl -parse_seqids -out reference.fa -title "Reference_Title"

command line blastn call: blastn -task 'blastn-short' -query query.fa -db reference_index.fa -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq' -max_target_seqs 1 -reward 2 -gapopen 4 > output.txt

R script for summarizing data:

set filenames for inout and output files

blast.output.filename <- "~/Desktop/DD/m110923_025602_00123_c100181102555600000315046911131143_s2_p0.bas.h5_mapped_output.txt" output.filename <- "~/Desktop/DD/alignment_summary.txt"

read in blast alingment in tab delimited format

the.data <- read.table(blast.output.filename, as.is=T, header=T)

generate summary count table for sseqid column

count.table <- summary(as.factor(the.data[,2])) count.table <- cbind(BarcodeName=names(count.table), Count=as.numeric(count.table))

initiate variables for basic information about sseqid matches

med.length <- rep(NA, nrow(count.table)) max.length <- rep(NA, nrow(count.table)) percent.20 <- rep(NA, nrow(count.table)) barcode <- rep(NA, nrow(count.table))

for each sseqid, generate summary information

for (index in 1:nrow(count.table)) { temp.data <- which(the.data[,2]==count.table[index,1]) med.length[index] <- median(as.numeric(the.data[temp.data,"length"], na.rm=T)) max.length[index] <- max(as.numeric(the.data[temp.data,"length"]), na.rm=T) percent.20[index] <- length(which(as.numeric(the.data[temp.data,"length"])>=20))/length(temp.data) barcode[index] <- the.data[temp.data[(which(as.numeric(the.data[temp.data,"length"])==max.length[index]))[1]],"qseq"] }

generate final table and write output

count.table <- cbind(count.table, med.length, max.length, percent.20, barcode) write.table(count.table, output.filename, sep="\t", row.names=F, col.names=T)

User:Alex S. Nord/Notebook/Computational Work/2011/10/19: Difference between revisions

Revision as of 12:30, 19 October 2011

Mapping alignments to barcodes using command line BLAST

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools

@@ Line 6: / Line 6: @@
 | colspan="2"|
 <!-- ##### DO NOT edit above this line unless you know what you are doing. ##### -->
-==Entry title==
+==Mapping alignments to barcodes using command line BLAST==
-* Insert content here...
+BLAST command line manual:
+http://www.ncbi.nlm.nih.gov/books/NBK1763/
+BLAST command line dmg for Mac:
+ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.2.25+.dmg
+UNIX command to convert sam to fasta using awk
+awk '{OFS="\t"; print ">"$1"\n"$10}' filename.sam > filename.fa
+command line call to generate indexed reference files for mapping to:
+makeblastdb -in reference.fa -dbtype nucl -parse_seqids -out reference.fa -title "Reference_Title"
+command line blastn call:
+blastn -task 'blastn-short' -query query.fa -db reference_index.fa -outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qseq sseq' -max_target_seqs 1 -reward 2 -gapopen 4 > output.txt
+R script for summarizing data:
+# set filenames for inout and output files
+blast.output.filename <- "~/Desktop/DD/m110923_025602_00123_c100181102555600000315046911131143_s2_p0.bas.h5_mapped_output.txt"
+output.filename <- "~/Desktop/DD/alignment_summary.txt"
+# read in blast alingment in tab delimited format
+the.data <- read.table(blast.output.filename, as.is=T, header=T)
+# generate summary count table for sseqid column
+count.table <- summary(as.factor(the.data[,2]))
+count.table <- cbind(BarcodeName=names(count.table), Count=as.numeric(count.table))
+# initiate variables for basic information about sseqid matches
+med.length <- rep(NA, nrow(count.table))
+max.length <- rep(NA, nrow(count.table))
+percent.20 <- rep(NA, nrow(count.table))
+barcode <- rep(NA, nrow(count.table))
+# for each sseqid, generate summary information
+for (index in 1:nrow(count.table)) {
+	temp.data <- which(the.data[,2]==count.table[index,1])
+	med.length[index] <- median(as.numeric(the.data[temp.data,"length"], na.rm=T))
+	max.length[index] <- max(as.numeric(the.data[temp.data,"length"]), na.rm=T)
+	percent.20[index] <- length(which(as.numeric(the.data[temp.data,"length"])>=20))/length(temp.data)
+	barcode[index] <- the.data[temp.data[(which(as.numeric(the.data[temp.data,"length"])==max.length[index]))[1]],"qseq"]
+}
+# generate final table and write output
+count.table <- cbind(count.table, med.length, max.length, percent.20, barcode)
+write.table(count.table, output.filename, sep="\t", row.names=F, col.names=T)
 <!-- ##### DO NOT edit below this line unless you know what you are doing. ##### -->