Wilke:Information theory

Concepts from information theory, such as entropy and mutual information, can provide useful insight in bioinformatics applications. One downside of these quantities is that they tend to be difficult to estimate without bias. For example, the naive entropy estimator, [math]\displaystyle{ H=-\sum_i (n_i/N) \ln(n_i/N), }[/math] where [math]\displaystyle{ n_i }[/math] is the number of observations of type i and N is the total number of observations, tends to underestimate the true entropy for small samples.
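The bias is easy to see in a small worked example. Suppose we estimate the entropy of a fair coin from only N = 2 tosses: half the time the two tosses agree and the naive estimate is 0, and half the time they differ and the estimate is [math]\displaystyle{ \ln 2 }[/math]. The expected estimate, [math]\displaystyle{ (\ln 2)/2 \approx 0.35 }[/math], is therefore only half the true entropy [math]\displaystyle{ \ln 2 \approx 0.69 }[/math]. The script below implements the naive estimator together with three bias-corrected alternatives.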

<pre>
#!/usr/bin/env python3

import math # for log function

# for the polygamma (digamma) function, needed only by Entropy_H_psi(z)
from scipy.special import polygamma

EulerGamma = 0.57721566490153286060 # Euler-Mascheroni constant

def Gi( n ):
    '''Helper function needed for entropy estimation.
    Defined by Grassberger 2003, http://arxiv.org/abs/physics/0307138
    '''
    if n == 0:
        return 0
    if n == 1:
        return -EulerGamma-math.log(2)
    if n == 2:
        return 2-EulerGamma-math.log(2)
    if (n % 2) == 1: # odd n
        return Gi( n-1 )
    return Gi( n-2 ) + 2./(n-1) # even n
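# Gi(n) implements Grassberger's recurrence for the numbers G_n:
#   G_1 = -gamma - ln 2,   G_2 = 2 - gamma - ln 2,
#   G_{2n+1} = G_{2n},     G_{2n+2} = G_{2n} + 2/(2n+1),
# where gamma is the Euler-Mascheroni constant. The recursive form
# above is simple but not memoized; an iterative version would avoid
# deep recursion for large counts (a possible optimization, not part
# of the original code).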



def Entropy_H_naive( z ):
    '''Naive entropy estimator. Generally biased. Should be avoided.

    z is a list of integer numbers, e.g., z=[1,4,3,2,2,6]. Each entry
    counts the number of times a particular observation has been made.
    '''
    N = sum(z) # total number of observations
    M = len(z) # number of distinct observations
    return math.log( N ) - (1./N) * sum([n*math.log(n) for n in z])
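# Sanity check (not in the original script): two categories observed
# twice each give exactly ln 2, the entropy of a fair coin:
#   Entropy_H_naive([2, 2]) == log(4) - (1/4)*(2*log(2) + 2*log(2))
#                           == log(2)  (about 0.6931)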


def Entropy_H_Miller( z ):
    '''Miller entropy estimator. Simplest correction to the
    naive estimator.

    z is a list of integer numbers, e.g., z=[1,4,3,2,2,6]. Each entry
    counts the number of times a particular observation has been made.
    '''
    N = sum(z) # total number of observations
    M = len(z) # number of distinct observations
    return math.log( N ) - (1./N) * sum([n*(math.log(n)-0.5/n) for n in z])
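# Algebraically, the -0.5/n term adds M/(2N) to the naive estimate:
#   -(1/N) * sum(n * (-0.5/n)) = M/(2N),
# a Miller-type bias correction that depends only on the number of
# distinct categories M and the sample size N.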


def Entropy_H_psi( z ):
    '''Entropy estimator according to Grassberger 1988.
    Better than the Miller estimator. Based on the digamma function psi.

    z is a list of integer numbers, e.g., z=[1,4,3,2,2,6]. Each entry
    counts the number of times a particular observation has been made.
    '''
    N = sum(z) # total number of observations
    M = len(z) # number of distinct observations
    return math.log( N ) - (1./N) * \
        sum([n*(polygamma(0,n)+(-1.)**n/(n*(n+1))) for n in z])
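# polygamma(0, n) is the digamma function psi(n); for integer n >= 1,
# psi(n) = -gamma + sum_{k=1}^{n-1} 1/k. This estimator thus replaces
# ln(n) in the naive formula with psi(n) + (-1)^n / (n*(n+1)).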


def Entropy_H_G( z ):
    '''Best entropy estimator according to Grassberger 2003,
    http://arxiv.org/abs/physics/0307138

    z is a list of integer numbers, e.g., z=[1,4,3,2,2,6]. Each entry
    counts the number of times a particular observation has been made.
    '''
    N = sum(z) # total number of observations
    M = len(z) # number of distinct observations
    return math.log( N ) - (1./N) * sum([n*Gi(n) for n in z])
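# For large counts, Gi(n) approaches the digamma value psi(n) and hence
# ln(n), so Entropy_H_G agrees with the naive estimator when every
# category is well sampled; the correction matters most for small counts.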

z = [1,1,1,1,1,1,1,2,2,2,3,3,5,6,7,8,10,24,147]
print()
print("z =", z)
print("Entropy estimates:", Entropy_H_naive( z ), Entropy_H_Miller( z ),
      Entropy_H_psi( z ), Entropy_H_G( z ))
</pre>
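
The estimators take a list of counts, not the raw observations themselves. As a minimal sketch of how such counts might be built (the variable names here are illustrative, not part of the original script), Python's collections.Counter does the tallying:

<pre>
from collections import Counter

observations = ['A', 'C', 'A', 'T', 'G', 'A', 'T'] # hypothetical raw data
z = list(Counter(observations).values()) # one count per distinct symbol
# z is a permutation of [3, 2, 1, 1]; the order of the counts does not
# matter to any of the estimators above
</pre>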