User:Lindenb/Notebook/UMR915/20101117: Difference between revisions
From OpenWetWare
(New page: {{PLNB|2010116|2010118}} =Integragen= creating custom tracks for Cedric =Test indexing the genome= test indexing the human genome with BDB JE: '''wordLength:9''' <pre>import java.io.Buf...) |
|||
Line 6: | Line 6: | ||
test indexing the human genome with BDB JE: '''wordLength:9''' | test indexing the human genome with BDB JE: '''wordLength:9''' | ||
<pre>import java.io.BufferedReader; | <pre style="height:200px;overflow:auto;">import java.io.BufferedReader; | ||
import java.io.File; | import java.io.File; | ||
import java.io.FileReader; | import java.io.FileReader; |
Revision as of 01:32, 18 November 2010
Integragen
creating custom tracks for Cedric
Test indexing the genome
Test indexing the human genome with Berkeley DB Java Edition (BDB JE), using wordLength: 9
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FilenameFilter; import java.util.ArrayList; import java.util.List; import com.sleepycat.bind.tuple.TupleBinding; import com.sleepycat.bind.tuple.TupleInput; import com.sleepycat.bind.tuple.TupleOutput; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseConfig; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.Environment; import com.sleepycat.je.EnvironmentConfig; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; /* * javac -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar -sourcepath ~/workspace/SANDBOX/src -d classes ~/workspace/SANDBOX/src/IndexTheGenome.java * java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome */ public class IndexTheGenome { private File directory; private int wordLength=9; private Environment environment; private Database prefix2locs; private int maxHit=10000; private static class LocIndex { byte seqIndex; int position; } private class LocIndexBinding extends TupleBinding<List<LocIndex>> { @Override public List<LocIndex> entryToObject(TupleInput in) { int n=in.readInt(); List<LocIndex> list=new ArrayList<IndexTheGenome.LocIndex>(n); for(int i=0;i< n;++i) { LocIndex loc=new LocIndex(); loc.seqIndex=in.readByte(); loc.position=in.readInt(); list.add(loc); } return list; } @Override public void objectToEntry(List<LocIndex> list, TupleOutput out) { out.writeInt(list.size()); for(LocIndex loc:list) { out.writeByte(loc.seqIndex); out.writeInt(loc.position); } } } IndexTheGenome() { } private void open() throws Exception { close(); EnvironmentConfig envCfg=new EnvironmentConfig(); envCfg.setAllowCreate(true); envCfg.setReadOnly(false); envCfg.setTransactional(false); envCfg.setConfigParam(EnvironmentConfig.LOG_FILE_MAX,"250000000"); this.environment=new Environment(this.directory, envCfg); DatabaseConfig cfg=new DatabaseConfig(); 
cfg.setAllowCreate(true); cfg.setReadOnly(false); cfg.setTransactional(false); cfg.setDeferredWrite(true); this.prefix2locs=this.environment.openDatabase(null, "prefix2locs", cfg); } private void close() throws Exception { try { if(this.prefix2locs!=null) { this.prefix2locs.close(); } } catch (Exception e) { this.prefix2locs=null; } try { if(this.environment!=null) { this.environment.cleanLog(); this.environment.close(); } } catch (Exception e) { this.environment=null; } } private void doIndex(File file) throws Exception { int countOverflows=0; int countKeys=0; LocIndexBinding binding=new LocIndexBinding(); DatabaseEntry key=new DatabaseEntry(); DatabaseEntry value=new DatabaseEntry(); byte array[]=new byte[this.wordLength]; int arraySize=0; byte seqIndex=-1; int genome=0; String line; List<LocIndex> locs=null; System.err.println("Indexing "+file); long now=System.currentTimeMillis(); BufferedReader in=new BufferedReader(new FileReader(file)); while((line=in.readLine())!=null) { if(line.startsWith(">")) { this.prefix2locs.sync(); this.environment.cleanLog(); System.err.println("Found "+line +" (overflows: "+countOverflows+" keys:"+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins)"); ++seqIndex; arraySize=0; genome=0; } else { for(int i=0;i< line.length();++i) { char c=Character.toUpperCase(line.charAt(i)); if(Character.isWhitespace(c)) continue; if(c=='A' || c=='T' || c=='G' || c=='C') { array[arraySize++]=(byte)c; if(arraySize==array.length) { key.setData(array); if(this.prefix2locs.get(null, key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS) { locs=binding.entryToObject(value); } else { locs=new ArrayList<IndexTheGenome.LocIndex>(1); countKeys++; } if(locs.size()<this.maxHit) { LocIndex index=new LocIndex(); index.seqIndex=seqIndex; index.position=genome; locs.add(index); binding.objectToEntry(locs, value); this.prefix2locs.put(null, key, value); } else { ++countOverflows; } arraySize=0; } } else { arraySize=0; } genome++; } } } 
in.close(); System.err.println("overflows: "+countOverflows+" keys: "+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins"); key=new DatabaseEntry(); Cursor c=this.prefix2locs.openCursor(null, null); while(c.getNext(key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS) { locs=binding.entryToObject(value); if(locs.size()>=this.maxHit) c.delete(); } c.close(); } public static void main(String[] args) { try { String program="undefined"; File fastaDir=null; IndexTheGenome app=new IndexTheGenome(); int optind=0; while(optind<args.length) { if(args[optind].equals("-h")) { return; } else if(args[optind].equals("-d")) { app.directory=new File(args[++optind]); } else if(args[optind].equals("-f")) { fastaDir=new File(args[++optind]); } else if(args[optind].equals("-w")) { app.wordLength=Integer.parseInt(args[++optind]); } else if(args[optind].equals("-p")) { program=args[++optind]; } else if(args[optind].equals("--")) { optind++; break; } else if(args[optind].startsWith("-")) { System.err.println("Unnown option: "+args[optind]); return; } else { break; } ++optind; } if(app.directory==null) { System.err.println("Dir missing"); return ; } app.open(); if(program.equals("index")) { if(fastaDir==null) { System.err.println("FastaDir missing"); return ; } for(File fasta:fastaDir.listFiles(new FilenameFilter() { @Override public boolean accept(File base, String s) { return s.endsWith(".fa"); }})) { app.doIndex(fasta); } } else { System.err.println("undefined "+program); } app.close(); } catch (Exception e) { e.printStackTrace(); } } }