User:Lindenb/Notebook/UMR915/20110510: Difference between revisions
From OpenWetWare
No edit summary |
No edit summary |
||
Line 43: | Line 43: | ||
41 mirna | 41 mirna | ||
42 region.splice</pre> | 42 region.splice</pre> | ||
==script== | |||
<pre>#extract mutated exome | |||
#drop variants whose rs.name (column 4) starts with "rs" | |||
#keep only rows whose type (column 19) contains "_het" | |||
#drop rows whose type (column 19) contains "douteux" (French for "doubtful") | |||
#NOTE(review): no separate filter for 'SNP_het*' is visible below; presumably covered by the "douteux" test - confirm | |||
#keep only rows whose typeannot (column 26) contains "nonsense" or "missense" | |||
#keep columns 1-27 only, which drops the trailing columns (cds/prot sequences among them) | |||
#sort on the key that starts at Gene.name (column 20) | |||
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |\ | |||
awk -F ' ' '{if(substr($4,1,2)!="rs") print;}' |\ | |||
awk -F ' ' '{if(index($19,"douteux")==0) print;}' |\ | |||
awk -F ' ' '{if(index($19,"_het")!=0) print;}' |\ | |||
awk -F ' ' '{if(index($26,"nonsense")!=0 || index($26,"missense")!=0) print;}' |\ | |||
cut -d ' ' -f 1-27 |\ | |||
sort -t ' ' -k20 > _jeter1.txt | |||
#self-join on Gene.name (column 20): the gene becomes column 1 and every pair of variants in a gene becomes one row | |||
#keep a pair only if both variants are on the same chromosome ($3,$29) and pos1 < pos2 ($2,$28) | |||
#keep gene, pos1, chrom, type1, typeannot1, pos2, type2, typeannot2 | |||
join -t ' ' -j 20 _jeter1.txt _jeter1.txt |\ | |||
awk -F ' ' '{if($3==$29 && int($2) < int($28) ) print;}' |\ | |||
cut -d ' ' -f 1,2,3,20,26,28,46,52 > _jeter2.txt | |||
#extract wild exome (the u2437 annotation file) | |||
#keep Position.hg19, chrom, Gene.name (columns 1,2,20) | |||
#sort by gene, which is now column 3 | |||
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |\ | |||
cut -d ' ' -f 1,2,20 |\ | |||
sort -t ' ' -k 3 > _jeter3.txt | |||
#join mutated pairs & wild data by gene: wild position lands in $9, wild chrom in $10 | |||
#keep a row if the wild SNP is on the same chromosome and at neither position of the pair | |||
#keep the 8 pair columns only, dropping the wild columns | |||
join -t ' ' -1 1 -2 3 _jeter2.txt _jeter3.txt|\ | |||
awk -F ' ' '{if($3==$10 && int($9) != int($2) && int($9) != int($6)) print;}' |\ | |||
cut -d ' ' -f 1-8 > _jeter4.txt | |||
#extract the distinct gene names | |||
cut -d ' ' -f 1 _jeter4.txt | sort | uniq > _jeter5.txt | |||
#rm _jeter*.txt</pre> |
Revision as of 08:12, 10 May 2011
Integragen
Columns:
1 Position.hg19 2 chrom 3 sample.ID 4 rs.name 5 hapmap_ref_other 6 X1000Genome.obs 7 X1000Genome.desc 8 Freq.HTZ.ExomesV1 9 Freq.Hom.ExomesV1 10 A 11 C 12 G 13 T 14 modified_call 15 total 16 used 17 score 18 reference 19 type 20 Gene.name 21 Gene.start 22 Gene.end 23 strand 24 nbre.exon 25 refseq 26 typeannot 27 type.pos 28 index.cdna 29 index.prot 30 Taille.cdna 31 Intron.start 32 Intron.end 33 codon.wild 34 aa.wild 35 codon.mut 36 aa.mut 37 cds.wild 38 cds.mut 39 prot.wild 40 prot.mut 41 mirna 42 region.splice
script
# Find genes that carry at least two candidate mutations in the pooled
# ("mutated") exome at positions where the wild exome (u2437) has no SNP.
#
# Inputs (current directory):
#   AllChrom.exome.snp.pool.new.annotation.gz    mutated exome annotation
#   AllChrom.exome.snp.u2437.new.annotation.gz   wild exome annotation
# Outputs (current directory):
#   _jeter1.txt  filtered mutated SNPs (columns 1-27), sorted by Gene.name
#   _jeter2.txt  SNP pairs: gene pos1 chrom type1 typeannot1 pos2 type2 typeannot2
#   _jeter3.txt  wild SNPs: position chrom gene
#   _jeter4.txt  SNP pairs whose two positions are both absent from the wild sample
#   _jeter5.txt  distinct gene names found in _jeter4.txt
#
# Column numbers used below (from the annotation header):
#   1 Position.hg19  2 chrom  4 rs.name  19 type  20 Gene.name  26 typeannot

# Field delimiter shared by awk/cut/sort/join. Defaults to the single space
# the script has always used.
# NOTE(review): if the annotation files are really tab-separated, export
# DELIM as a tab character before running - confirm against the data.
DELIM="${DELIM:- }"

# --- 1. extract mutated exome -------------------------------------------
# One awk pass applies all row filters:
#   - drop known SNPs            (rs.name starts with "rs")
#   - drop the low qualities     (type contains "douteux", i.e. doubtful)
#   - keep heterozygous calls    (type contains "_het")
#   - keep non-synonymous/stop   (typeannot contains "missense"/"nonsense")
# Then drop the DNA & protein sequence columns (keep 1-27) and sort by gene.
# The sort key is limited to field 20 and collation is pinned to the C locale
# so that the ordering is exactly the one join(1) expects below.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
awk -F "$DELIM" '
    substr($4, 1, 2) != "rs" &&
    index($19, "douteux") == 0 &&
    index($19, "_het") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
' |
cut -d "$DELIM" -f 1-27 |
LC_ALL=C sort -t "$DELIM" -k 20,20 > _jeter1.txt

# --- 2. pair up mutations lying in the same gene -------------------------
# Self-join on Gene.name. In the joined row the gene is field 1, the first
# SNP occupies fields 2-27 and the second SNP fields 28-53, hence:
#   $2/$28 = positions, $3/$29 = chromosomes.
# Keep each unordered pair once (same chromosome, pos1 < pos2), then keep
# gene, pos1, chrom, type1, typeannot1, pos2, type2, typeannot2.
LC_ALL=C join -t "$DELIM" -j 20 _jeter1.txt _jeter1.txt |
awk -F "$DELIM" '$3 == $29 && int($2) < int($28)' |
cut -d "$DELIM" -f 1,2,3,20,26,28,46,52 > _jeter2.txt

# --- 3. extract wild exome -----------------------------------------------
# Keep position, chrom, gene; order by gene.
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
cut -d "$DELIM" -f 1,2,20 |
sort -t "$DELIM" -k 3 > _jeter3.txt

# --- 4. discard pairs that the wild sample also carries ------------------
# Anti-join on (chrom, position): load every wild SNP into a lookup table,
# then print a pair only when neither of its two positions is in the table.
# A gene-wise join followed by a per-row "!=" test is not equivalent: it keeps
# a pair as soon as ANY wild SNP of the gene lies elsewhere, drops pairs in
# genes where the wild sample has no SNP at all, and repeats each surviving
# pair once per wild SNP.
# FILENAME is tested instead of NR==FNR so an empty _jeter3.txt cannot make
# awk mistake _jeter2.txt for the lookup file.
awk -F "$DELIM" '
    FILENAME == ARGV[1] { wild[$2, int($1)] = 1; next }
    !(($3, int($2)) in wild) && !(($3, int($6)) in wild)
' _jeter3.txt _jeter2.txt > _jeter4.txt

# --- 5. extract gene names -----------------------------------------------
cut -d "$DELIM" -f 1 _jeter4.txt | sort -u > _jeter5.txt

#rm _jeter*.txt