User:Lindenb/Notebook/UMR915/20110510: Difference between revisions

Revision as of 08:12, 10 May 2011

20110427‎ Top 20110511

Integragen

Columns:

     1  Position.hg19
     2  chrom
     3  sample.ID
     4  rs.name
     5  hapmap_ref_other
     6  X1000Genome.obs
     7  X1000Genome.desc
     8  Freq.HTZ.ExomesV1
     9  Freq.Hom.ExomesV1
    10  A
    11  C
    12  G
    13  T
    14  modified_call
    15  total
    16  used
    17  score
    18  reference
    19  type
    20  Gene.name
    21  Gene.start
    22  Gene.end
    23  strand
    24  nbre.exon
    25  refseq
    26  typeannot
    27  type.pos
    28  index.cdna
    29  index.prot
    30  Taille.cdna
    31  Intron.start
    32  Intron.end
    33  codon.wild
    34  aa.wild
    35  codon.mut
    36  aa.mut
    37  cds.wild
    38  cds.mut
    39  prot.wild
    40  prot.mut
    41  mirna
    42  region.splice

script

# Extract the candidate variants from the pooled (mutated) exome annotation.
# Column numbers refer to the tab-separated annotation file:
#   4 = rs.name, 19 = type, 20 = Gene.name, 26 = typeannot.
# A row is kept only if ALL of the following hold:
#   - rs.name does not start with "rs";
#   - type does not contain "douteux" (French for "doubtful"; presumably the
#     low-quality calls);
#   - type contains "_het" (the SNP_het calls);
#   - typeannot contains "nonsense" or "missense" (stop or non-synonymous).
# Only columns 1-27 are kept, which drops the DNA and protein sequence columns.
# The result is sorted on the gene name ALONE (-k20,20): a bare -k20 makes the
# sort key run from field 20 to the end of the line, and in a non-C locale that
# order can disagree with the single-field comparison join(1) performs when it
# reads this file.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
  awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    index($19, "douteux") == 0 &&
    index($19, "_het") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
  ' |
  cut -f 1-27 |
  sort -t "$(printf '\t')" -k20,20 > _jeter1.txt

# Pair up the candidate variants that fall in the same gene.
# _jeter1.txt is joined to itself on the gene name (column 20). In a joined
# row the gene name comes first, then the 26 remaining columns of the left
# record (columns 2-27), then those of the right record (columns 28-53), so:
#   $2 / $28 = position of the first / second variant
#   $3 / $29 = chromosome of the first / second variant
# A pair is kept only when both variants are on the same chromosome and the
# first position is strictly lower than the second; this also discards a
# variant paired with itself and the mirror image of each pair.
# Output columns: gene, pos1, chrom1, type1, typeannot1, pos2, type2, typeannot2.
join -t "$(printf '\t')" -1 20 -2 20 _jeter1.txt _jeter1.txt |
  awk -F '\t' '$3 == $29 && int($2) < int($28)' |
  cut -f 1,2,3,20,26,28,46,52 > _jeter2.txt


# Extract the SNPs listed in the wild exome annotation file, keeping only
# the position (column 1), the chromosome (column 2) and the gene name
# (column 20). The three remaining columns are ordered by gene name (now
# column 3, the last one) so that the file can be joined on that field.
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
  cut -f 1,2,20 |
  sort -t "$(printf '\t')" -k3 > _jeter3.txt

# Keep only the pairs of mutated SNPs that the wild exome does not share.
# Inputs:
#   _jeter2.txt  gene, pos1, chrom, type1, typeannot1, pos2, type2, typeannot2
#   _jeter3.txt  position, chrom, gene  (SNPs found in the wild sample)
# Output:
#   _jeter4.txt  the rows of _jeter2.txt (same 8 columns, same order) for which
#                the wild sample has no SNP at chrom:pos1 nor at chrom:pos2.
#
# The previous join-based filter tested each (pair, wild SNP) row separately,
# so a pair was kept as soon as the gene held ANY other wild SNP, even when the
# wild sample carried one of the two variants. It also repeated a pair once per
# unrelated wild SNP and dropped every gene that has no wild SNP at all.
# Looking both positions up in the set of wild SNPs fixes these three problems
# and no longer depends on the sort order of the two files.
# NOTE(review): this assumes the intent is "the wild sample carries neither
# variant of the pair" - confirm.
awk -F '\t' '
  # First file: record every wild SNP under the key (chrom, position).
  FILENAME == ARGV[1] { wild[$2, int($1)] = 1; next }
  # Second file: print the pair unless one of its positions is a wild SNP.
  !(($3, int($2)) in wild) && !(($3, int($6)) in wild)
' _jeter3.txt _jeter2.txt > _jeter4.txt

# List every gene name (column 1 of _jeter4.txt) exactly once, in sorted order.
cut -f 1 _jeter4.txt | sort -u > _jeter5.txt


#rm _jeter*.txt

User:Lindenb/Notebook/UMR915/20110510: Difference between revisions

Revision as of 08:12, 10 May 2011

Integragen

script

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools