
Fit a mixture of two Gaussian/normal distributions to a histogram from one set of data, Python

I have one set of data in Python. I am plotting it as a histogram, and the plot shows a bimodal distribution, so I am trying to fit a Gaussian profile to each peak of the bimodality.

If I use the code below, it requires me to have two datasets of the same size. However, I only have one dataset, and it cannot be divided equally. How can I fit these two Gaussians?

from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np

clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(yourdata)
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
histdist = matplotlib.pyplot.hist(yourdata, 100, normed=True)
plotgauss1 = lambda x: matplotlib.pyplot.plot(x, w1*matplotlib.mlab.normpdf(x, m1, np.sqrt(c1))[0], linewidth=3)
plotgauss2 = lambda x: matplotlib.pyplot.plot(x, w2*matplotlib.mlab.normpdf(x, m2, np.sqrt(c2))[0], linewidth=3)
plotgauss1(histdist[1])
plotgauss2(histdist[1])
asked Mar 14 '16 by astrochris

3 Answers

Here is a simulation with SciPy tools:

from pylab import *
from scipy.optimize import curve_fit

data=concatenate((normal(1,.2,5000),normal(2,.2,2500))) # simulated bimodal sample
y,x,_=hist(data,100,alpha=.3,label='data') # bin counts y and bin edges x

x=(x[1:]+x[:-1])/2 # for len(x)==len(y)

def gauss(x,mu,sigma,A):
    return A*exp(-(x-mu)**2/2/sigma**2)

def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)

expected=(1,.2,250,2,.2,125) # initial guesses: mu1, sigma1, A1, mu2, sigma2, A2
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')
legend()
print(params,'\n',sigma)    

The data are the superposition of two normal samples, and the model is a sum of Gaussian curves. We obtain:

[Plot: histogram of the data with the fitted bimodal model overlaid, with legend]

And the estimated parameters are:

# via pandas :
# pd.DataFrame(data={'params':params,'sigma':sigma},index=bimodal.__code__.co_varnames[1:])
            params     sigma
mu1       0.999447  0.002683
sigma1    0.202465  0.002696
A1      226.296279  2.597628
mu2       2.003028  0.005036
sigma2    0.193235  0.005058
A2      117.823706  2.658789
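If you already have a single dataset, as in the question, the same fit applies directly to its histogram. Below is a minimal sketch of that case; yourdata is a hypothetical placeholder for your own 1D array of samples, and bimodal() is the model function defined above:

from pylab import hist, plot, legend
from scipy.optimize import curve_fit

# yourdata: your own 1D array of samples (hypothetical placeholder name)
y,x,_=hist(yourdata,100,alpha=.3,label='data') # bin counts and bin edges
x=(x[1:]+x[:-1])/2 # bin centres, so len(x)==len(y)

# rough initial guesses read off the histogram: mu1, sigma1, A1, mu2, sigma2, A2
expected=(1,.2,250,2,.2,125)
params,cov=curve_fit(bimodal,x,y,expected) # bimodal() as defined above
plot(x,bimodal(x,*params),color='red',lw=3,label='model')
legend()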
answered Nov 03 '22 by B. M.

This can be achieved in a clean and simple way using the sklearn Python library:

import numpy as np
from sklearn.mixture import GaussianMixture
from pylab import concatenate, normal

# First normal distribution parameters
mu1 = 1
sigma1 = 0.1

# Second normal distribution parameters
mu2 = 2
sigma2 = 0.2

w1 = 2/3 # Proportion of samples from first distribution
w2 = 1/3 # Proportion of samples from second distribution

n = 7500 # Total number of samples

n1 = int(n*w1) # Number of samples from first distribution
n2 = int(n*w2) # Number of samples from second distribution

# Generate n1 samples from the first normal distribution and n2 samples from the second normal distribution
X = concatenate((normal(mu1, sigma1, n1), normal(mu2, sigma2, n2))).reshape(-1, 1)

# Determine parameters mu1, mu2, sigma1, sigma2, w1 and w2
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
print(f'mu1={gm.means_[0]}, mu2={gm.means_[1]}')
print(f'sigma1={np.sqrt(gm.covariances_[0])}, sigma2={np.sqrt(gm.covariances_[1])}')
print(f'w1={gm.weights_[0]}, w2={gm.weights_[1]}')
print(f'n1={int(n * gm.weights_[0])} n2={int(n * gm.weights_[1])}')
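
Since the original goal was to overlay the two Gaussian profiles on the histogram, the fitted parameters can also be plotted. Here is a minimal sketch continuing from the X array and fitted gm object above, using scipy.stats.norm for the component densities (x_plot is just an illustrative name):

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Density-normalised histogram of the raw samples
plt.hist(X.ravel(), bins=100, density=True, alpha=.3, label='data')

# Weighted density of each fitted component: w_k * N(x | mu_k, sigma_k)
x_plot = np.linspace(X.min(), X.max(), 500)
for k in range(gm.n_components):
    mu = gm.means_[k, 0]
    sigma = np.sqrt(gm.covariances_[k, 0, 0])  # 'full' covariance: one 1x1 matrix per component
    plt.plot(x_plot, gm.weights_[k] * norm.pdf(x_plot, mu, sigma), lw=2, label=f'component {k+1}')

plt.legend()
plt.show()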
answered Nov 02 '22 by Tarik


As the use of pylab is now strongly discouraged by matplotlib because of namespace cluttering, I have rewritten the script by B.M. and added a plot of the two components:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

#data generation
np.random.seed(123)
data=np.concatenate((np.random.normal(1, .2, 5000), np.random.normal(1.6, .3, 2500)))
y,x,_=plt.hist(data, 100, alpha=.3, label='data')
x=(x[1:]+x[:-1])/2 # for len(x)==len(y)

#x, y inputs can be lists or 1D numpy arrays

def gauss(x, mu, sigma, A):
    return A*np.exp(-(x-mu)**2/2/sigma**2)

def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)

expected = (1, .2, 250, 2, .2, 125)
params, cov = curve_fit(bimodal, x, y, expected)
sigma=np.sqrt(np.diag(cov))
x_fit = np.linspace(x.min(), x.max(), 500)
#plot combined...
plt.plot(x_fit, bimodal(x_fit, *params), color='red', lw=3, label='model')
#...and individual Gauss curves
plt.plot(x_fit, gauss(x_fit, *params[:3]), color='red', lw=1, ls="--", label='distribution 1')
plt.plot(x_fit, gauss(x_fit, *params[3:]), color='red', lw=1, ls=":", label='distribution 2')
#and the original data points if no histogram has been created before
#plt.scatter(x, y, marker="X", color="black", label="original data")
plt.legend()
print(pd.DataFrame(data={'params': params, 'sigma': sigma}, index=bimodal.__code__.co_varnames[1:]))
plt.show() 

[Plot: histogram with the combined model and the two individual Gauss curves]

            params     sigma
mu1       1.014589  0.005273
sigma1    0.203826  0.004067
A1      230.654585  3.667409
mu2       1.635225  0.022423
sigma2    0.282690  0.019070
A2       72.621856  2.148459
answered Nov 02 '22 by Mr. T