User:Lindenb/Notebook/UMR915/20101117: Difference between revisions

From OpenWetWare
Jump to navigation | Jump to search
(New page: {{PLNB|2010116|2010118}} =Integragen= creating custom tracks for Cedric =Test indexing the genome= test indexing the human genome with BDB JE: '''wordLength:9''' <pre>import java.io.Buf...)
 
Line 6: Line 6:
test indexing the human genome with BDB JE: '''wordLength:9'''  
test indexing the human genome with BDB JE: '''wordLength:9'''  


<pre>import java.io.BufferedReader;
<pre style="height:200px;overflow:auto;">import java.io.BufferedReader;
import java.io.File;
import java.io.File;
import java.io.FileReader;
import java.io.FileReader;

Revision as of 01:32, 18 November 2010

20101116        Top        20101118       


Integragen

creating custom tracks for Cedric

Test indexing the genome

test indexing the human genome with BDB JE: wordLength:9

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.List;

import com.sleepycat.bind.tuple.TupleBinding;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;

/*
 * javac -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar -sourcepath ~/workspace/SANDBOX/src -d classes  ~/workspace/SANDBOX/src/IndexTheGenome.java
 *  java -cp /usr/local/package/je-4.0.103/lib/je-4.0.103.jar:classes IndexTheGenome
 */
public class IndexTheGenome
	{
	private File directory;
	private int wordLength=9;
	private Environment environment;
	private Database prefix2locs;
	private int maxHit=10000;
	
	
	private static class LocIndex
		{
		byte seqIndex;
		int position;
		}
	
	private class LocIndexBinding
		extends TupleBinding<List<LocIndex>>
		{
		@Override
		public List<LocIndex> entryToObject(TupleInput in)
			{
			int n=in.readInt();
			List<LocIndex> list=new ArrayList<IndexTheGenome.LocIndex>(n);
			for(int i=0;i< n;++i)
				{
				LocIndex loc=new LocIndex();
				loc.seqIndex=in.readByte();
				loc.position=in.readInt();
				list.add(loc);
				}
			return list;
			}
		@Override
		public void objectToEntry(List<LocIndex> list, TupleOutput out)
			{
			out.writeInt(list.size());
			for(LocIndex loc:list)
				{
				out.writeByte(loc.seqIndex);
				out.writeInt(loc.position);
				}
			}
		}
	
	IndexTheGenome()
		{
		
		}
	private void open() throws Exception
		{
		close();
		EnvironmentConfig envCfg=new EnvironmentConfig();
		envCfg.setAllowCreate(true);
		envCfg.setReadOnly(false);
		envCfg.setTransactional(false);
		
		envCfg.setConfigParam(EnvironmentConfig.LOG_FILE_MAX,"250000000");
		this.environment=new Environment(this.directory, envCfg);
		DatabaseConfig cfg=new DatabaseConfig();
		cfg.setAllowCreate(true);
		cfg.setReadOnly(false);
		cfg.setTransactional(false);
		cfg.setDeferredWrite(true);
		this.prefix2locs=this.environment.openDatabase(null, "prefix2locs", cfg);
		}
	private void close() throws Exception
		{
		try {
			if(this.prefix2locs!=null)
				{
				this.prefix2locs.close();
				}
			} 
		catch (Exception e)
			{
			this.prefix2locs=null;
			}
		
		try {
			if(this.environment!=null)
				{
				this.environment.cleanLog();
				this.environment.close();
				}
			} 
		catch (Exception e)
			{
			this.environment=null;
			}
		}
	
	private void doIndex(File file) throws Exception
		{
		int countOverflows=0;
		int countKeys=0;
		LocIndexBinding binding=new LocIndexBinding();
		DatabaseEntry key=new DatabaseEntry();
		DatabaseEntry value=new DatabaseEntry();
		byte array[]=new byte[this.wordLength];
		int arraySize=0;
		byte seqIndex=-1;
		int genome=0;
		String line;
		List<LocIndex> locs=null;
		System.err.println("Indexing "+file);
		long now=System.currentTimeMillis();
		BufferedReader in=new BufferedReader(new FileReader(file));
		while((line=in.readLine())!=null)
			{
			if(line.startsWith(">"))
				{
				this.prefix2locs.sync();
				this.environment.cleanLog();
				System.err.println("Found "+line +" (overflows: "+countOverflows+" keys:"+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins)");
				++seqIndex;
				arraySize=0;
				genome=0;
				}
			else
				{
				for(int i=0;i< line.length();++i)
					{
					char c=Character.toUpperCase(line.charAt(i));
					if(Character.isWhitespace(c)) continue;
					if(c=='A' || c=='T' || c=='G' || c=='C')
						{
						array[arraySize++]=(byte)c;
						if(arraySize==array.length)
							{
							key.setData(array);
							if(this.prefix2locs.get(null, key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS)
								{
								locs=binding.entryToObject(value);
								}
							else
								{
								locs=new ArrayList<IndexTheGenome.LocIndex>(1);
								countKeys++;
								}
							
							if(locs.size()<this.maxHit)
								{
								LocIndex index=new LocIndex();
								index.seqIndex=seqIndex;
								index.position=genome;
								locs.add(index);
								binding.objectToEntry(locs, value);
								this.prefix2locs.put(null, key, value);
								}
							else
								{
								++countOverflows;
								}
							arraySize=0;
							}
						}
					else
						{
						arraySize=0;
						}
					genome++;
					}
				}
			}
		in.close();
		System.err.println("overflows: "+countOverflows+" keys: "+countKeys+" time="+ (System.currentTimeMillis()-now)/(1000*60)+"mins");
		
		key=new DatabaseEntry();
		Cursor c=this.prefix2locs.openCursor(null, null);
		while(c.getNext(key, value, LockMode.DEFAULT)==OperationStatus.SUCCESS)
			{
			locs=binding.entryToObject(value);
			if(locs.size()>=this.maxHit) c.delete();
			}
		c.close();
		}
	
	public static void main(String[] args)
		{
		try {
			String program="undefined";
			File fastaDir=null;
			IndexTheGenome app=new IndexTheGenome();
			int optind=0;
			while(optind<args.length)
				{
				if(args[optind].equals("-h"))
					{
					return;
					}
				else if(args[optind].equals("-d"))
					{
					app.directory=new File(args[++optind]);
					}
				else if(args[optind].equals("-f"))
					{
					fastaDir=new File(args[++optind]);
					}
				else if(args[optind].equals("-w"))
					{
					app.wordLength=Integer.parseInt(args[++optind]);
					}
				else if(args[optind].equals("-p"))
					{
					program=args[++optind];
					}
				else if(args[optind].equals("--"))
					{
					optind++;
					break;
					}
				else if(args[optind].startsWith("-"))
					{
					System.err.println("Unnown option: "+args[optind]);
					return;
					}
				else
					{
					break;
					}
				++optind;
				}
			if(app.directory==null)
				{
				System.err.println("Dir missing");
				return ;
				}
			app.open();
			if(program.equals("index"))
				{
				if(fastaDir==null)
					{
					System.err.println("FastaDir missing");
					return ;
					}
				
				for(File fasta:fastaDir.listFiles(new FilenameFilter()
					{
					@Override
					public boolean accept(File base, String s)
						{
						return s.endsWith(".fa");
						}}))
					{
					app.doIndex(fasta);
					}
				}
			else
				{
				System.err.println("undefined "+program);
				}
			
			app.close();
			}
		catch (Exception e)
			{
			e.printStackTrace();
			}
		}
	}