
Fit a mixture of two Gaussian/normal distributions to a histogram from one set of data, Python

I have one set of data in Python. I am plotting it as a histogram, and the plot shows a bimodal distribution, so I am trying to fit a Gaussian profile to each peak of the bimodality.

If I use the code below, it requires me to have two datasets of the same size. However, I only have one dataset, and it cannot be divided equally. How can I fit these two Gaussians?

from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np

clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(yourdata)
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
histdist = matplotlib.pyplot.hist(yourdata, 100, normed=True)
plotgauss1 = lambda x: matplotlib.pyplot.plot(x, w1*matplotlib.mlab.normpdf(x, m1, np.sqrt(c1))[0], linewidth=3)
plotgauss2 = lambda x: matplotlib.pyplot.plot(x, w2*matplotlib.mlab.normpdf(x, m2, np.sqrt(c2))[0], linewidth=3)
plotgauss1(histdist[1])
plotgauss2(histdist[1])
asked Mar 14 '16 by astrochris

3 Answers

Here is a simulation with SciPy tools:

from pylab import *
from scipy.optimize import curve_fit

data=concatenate((normal(1,.2,5000),normal(2,.2,2500))) # simulated bimodal sample
y,x,_=hist(data,100,alpha=.3,label='data') # bin counts y and bin edges x

x=(x[1:]+x[:-1])/2 # for len(x)==len(y)

def gauss(x,mu,sigma,A):
    return A*exp(-(x-mu)**2/2/sigma**2)

def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)

expected=(1,.2,250,2,.2,125) # initial guesses: mu1, sigma1, A1, mu2, sigma2, A2
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')
legend()
print(params,'\n',sigma)    

The data are the superposition of two normal samples, and the model is a sum of Gaussian curves. We obtain:

[Plot: histogram of the data with the fitted bimodal model overlaid, with legend]

And the estimated parameters are:

# via pandas :
# pd.DataFrame(data={'params':params,'sigma':sigma},index=bimodal.__code__.co_varnames[1:])
            params     sigma
mu1       0.999447  0.002683
sigma1    0.202465  0.002696
A1      226.296279  2.597628
mu2       2.003028  0.005036
sigma2    0.193235  0.005058
A2      117.823706  2.658789
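If you already have a single dataset, as in the question, the same fit applies directly to its histogram. Below is a minimal sketch of that case; yourdata is a hypothetical placeholder for your own 1D array of samples, and bimodal() is the model function defined above:

from pylab import hist, plot, legend
from scipy.optimize import curve_fit

# yourdata: your own 1D array of samples (hypothetical placeholder name)
y,x,_=hist(yourdata,100,alpha=.3,label='data') # bin counts and bin edges
x=(x[1:]+x[:-1])/2 # bin centres, so len(x)==len(y)

# rough initial guesses read off the histogram: mu1, sigma1, A1, mu2, sigma2, A2
expected=(1,.2,250,2,.2,125)
params,cov=curve_fit(bimodal,x,y,expected) # bimodal() as defined above
plot(x,bimodal(x,*params),color='red',lw=3,label='model')
legend()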
answered Nov 03 '22 by B. M.

This can be achieved in a clean and simple way using the sklearn Python library:

import numpy as np
from sklearn.mixture import GaussianMixture
from pylab import concatenate, normal

# First normal distribution parameters
mu1 = 1
sigma1 = 0.1

# Second normal distribution parameters
mu2 = 2
sigma2 = 0.2

w1 = 2/3 # Proportion of samples from first distribution
w2 = 1/3 # Proportion of samples from second distribution

n = 7500 # Total number of samples

n1 = int(n*w1) # Number of samples from first distribution
n2 = int(n*w2) # Number of samples from second distribution

# Generate n1 samples from the first normal distribution and n2 samples from the second normal distribution
X = concatenate((normal(mu1, sigma1, n1), normal(mu2, sigma2, n2))).reshape(-1, 1)

# Determine parameters mu1, mu2, sigma1, sigma2, w1 and w2
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
print(f'mu1={gm.means_[0]}, mu2={gm.means_[1]}')
print(f'sigma1={np.sqrt(gm.covariances_[0])}, sigma2={np.sqrt(gm.covariances_[1])}')
print(f'w1={gm.weights_[0]}, w2={gm.weights_[1]}')
print(f'n1={int(n * gm.weights_[0])} n2={int(n * gm.weights_[1])}')
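
Since the original goal was to overlay the two Gaussian profiles on the histogram, the fitted parameters can also be plotted. Here is a minimal sketch continuing from the X array and fitted gm object above, using scipy.stats.norm for the component densities (x_plot is just an illustrative name):

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Density-normalised histogram of the raw samples
plt.hist(X.ravel(), bins=100, density=True, alpha=.3, label='data')

# Weighted density of each fitted component: w_k * N(x | mu_k, sigma_k)
x_plot = np.linspace(X.min(), X.max(), 500)
for k in range(gm.n_components):
    mu = gm.means_[k, 0]
    sigma = np.sqrt(gm.covariances_[k, 0, 0])  # 'full' covariance: one 1x1 matrix per component
    plt.plot(x_plot, gm.weights_[k] * norm.pdf(x_plot, mu, sigma), lw=2, label=f'component {k+1}')

plt.legend()
plt.show()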
answered Nov 02 '22 by Tarik


As the use of pylab is now strongly discouraged by matplotlib because of namespace cluttering, I have rewritten the script by B.M. and added a plot of the two components:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

#data generation
np.random.seed(123)
data=np.concatenate((np.random.normal(1, .2, 5000), np.random.normal(1.6, .3, 2500)))
y,x,_=plt.hist(data, 100, alpha=.3, label='data')
x=(x[1:]+x[:-1])/2 # for len(x)==len(y)

#x, y inputs can be lists or 1D numpy arrays

def gauss(x, mu, sigma, A):
    return A*np.exp(-(x-mu)**2/2/sigma**2)

def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)

expected = (1, .2, 250, 2, .2, 125)
params, cov = curve_fit(bimodal, x, y, expected)
sigma=np.sqrt(np.diag(cov))
x_fit = np.linspace(x.min(), x.max(), 500)
#plot combined...
plt.plot(x_fit, bimodal(x_fit, *params), color='red', lw=3, label='model')
#...and individual Gauss curves
plt.plot(x_fit, gauss(x_fit, *params[:3]), color='red', lw=1, ls="--", label='distribution 1')
plt.plot(x_fit, gauss(x_fit, *params[3:]), color='red', lw=1, ls=":", label='distribution 2')
#and the original data points if no histogram has been created before
#plt.scatter(x, y, marker="X", color="black", label="original data")
plt.legend()
print(pd.DataFrame(data={'params': params, 'sigma': sigma}, index=bimodal.__code__.co_varnames[1:]))
plt.show() 

[Plot: histogram with the combined model and the two individual Gauss curves]

            params     sigma
mu1       1.014589  0.005273
sigma1    0.203826  0.004067
A1      230.654585  3.667409
mu2       1.635225  0.022423
sigma2    0.282690  0.019070
A2       72.621856  2.148459
answered Nov 02 '22 by Mr. T