I have one set of data in Python. I am plotting it as a histogram, and the plot shows a bimodal distribution, so I am trying to fit a Gaussian profile over each of the two peaks.
If I use the code below, it requires two datasets of the same size. However, I only have one dataset, and it cannot be divided equally. How can I fit these two Gaussians?
from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(yourdata)
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
histdist = matplotlib.pyplot.hist(yourdata, 100, normed=True)
plotgauss1 = lambda x: matplotlib.pyplot.plot(x, w1*matplotlib.mlab.normpdf(x, m1, np.sqrt(c1))[0], linewidth=3)
plotgauss2 = lambda x: matplotlib.pyplot.plot(x, w2*matplotlib.mlab.normpdf(x, m2, np.sqrt(c2))[0], linewidth=3)
plotgauss1(histdist[1])
plotgauss2(histdist[1])
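Note that mixture.GMM and matplotlib.mlab.normpdf have since been removed from their libraries. A minimal sketch of the same idea with the current APIs (sklearn.mixture.GaussianMixture and scipy.stats.norm), assuming yourdata is a 1D NumPy array, would be:
from sklearn.mixture import GaussianMixture
from scipy.stats import norm
import matplotlib.pyplot as plt
import numpy as np

# a single dataset is enough: GaussianMixture just needs shape (n_samples, 1)
clf = GaussianMixture(n_components=2, covariance_type='full')
clf.fit(yourdata.reshape(-1, 1))
(m1, m2), (s1, s2) = clf.means_.ravel(), np.sqrt(clf.covariances_.ravel())
w1, w2 = clf.weights_
# density=True puts the histogram on the same scale as the weighted pdfs
counts, bins, _ = plt.hist(yourdata, 100, density=True, alpha=.3)
x = (bins[1:] + bins[:-1]) / 2
plt.plot(x, w1 * norm.pdf(x, m1, s1), linewidth=3)
plt.plot(x, w2 * norm.pdf(x, m2, s2), linewidth=3)
plt.show()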
Here is a simulation with SciPy tools:
from pylab import *
from scipy.optimize import curve_fit
data=concatenate((normal(1,.2,5000),normal(2,.2,2500)))
y,x,_=hist(data,100,alpha=.3,label='data')
x=(x[1:]+x[:-1])/2 # for len(x)==len(y)
def gauss(x, mu, sigma, A):
    return A*exp(-(x-mu)**2/2/sigma**2)

def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
    return gauss(x, mu1, sigma1, A1) + gauss(x, mu2, sigma2, A2)
expected=(1,.2,250,2,.2,125)
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')
legend()
print(params,'\n',sigma)
The data is the superposition of two normal samples, and the model is a sum of Gaussian curves. The estimated parameters are:
# via pandas :
# pd.DataFrame(data={'params':params,'sigma':sigma},index=bimodal.__code__.co_varnames[1:])
params sigma
mu1 0.999447 0.002683
sigma1 0.202465 0.002696
A1 226.296279 2.597628
mu2 2.003028 0.005036
sigma2 0.193235 0.005058
A2 117.823706 2.658789
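If you also want the relative weight of each component (comparable to the weights_ of a mixture model), it can be recovered from the fitted amplitudes, since the area under each curve is A*sigma*sqrt(2*pi) and the bin width cancels in the ratio. A small sketch reusing params from above:
import numpy as np
mu1, sigma1, A1, mu2, sigma2, A2 = params
area1 = A1 * sigma1 * np.sqrt(2 * np.pi)  # integral of the first Gaussian
area2 = A2 * sigma2 * np.sqrt(2 * np.pi)  # integral of the second Gaussian
w1, w2 = area1 / (area1 + area2), area2 / (area1 + area2)
print(f'w1={w1:.3f}, w2={w2:.3f}')  # roughly 2/3 and 1/3 for the simulated data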
This can be achieved in a clean and simple way using the sklearn Python library:
import numpy as np
from sklearn.mixture import GaussianMixture
from pylab import concatenate, normal
# First normal distribution parameters
mu1 = 1
sigma1 = 0.1
# Second normal distribution parameters
mu2 = 2
sigma2 = 0.2
w1 = 2/3 # Proportion of samples from first distribution
w2 = 1/3 # Proportion of samples from second distribution
n = 7500 # Total number of samples
n1 = int(n*w1) # Number of samples from first distribution
n2 = int(n*w2) # Number of samples from second distribution
# Generate n1 samples from the first normal distribution and n2 samples from the second normal distribution
X = concatenate((normal(mu1, sigma1, n1), normal(mu2, sigma2, n2))).reshape(-1, 1)
# Determine parameters mu1, mu2, sigma1, sigma2, w1 and w2
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
print(f'mu1={gm.means_[0]}, mu2={gm.means_[1]}')
print(f'sigma1={np.sqrt(gm.covariances_[0])}, sigma2={np.sqrt(gm.covariances_[1])}')
print(f'w1={gm.weights_[0]}, w2={gm.weights_[1]}')
print(f'n1={int(n * gm.weights_[0])} n2={int(n * gm.weights_[1])}')
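To check the fit visually, the fitted mixture density can be overlaid on a normalised histogram using scipy.stats.norm — a short sketch continuing from the snippet above:
import matplotlib.pyplot as plt
from scipy.stats import norm

x_plot = np.linspace(X.min(), X.max(), 500)
# weighted pdf of each fitted component (covariances_ has shape (2, 1, 1) here)
pdf1 = gm.weights_[0] * norm.pdf(x_plot, gm.means_[0, 0], np.sqrt(gm.covariances_[0, 0, 0]))
pdf2 = gm.weights_[1] * norm.pdf(x_plot, gm.means_[1, 0], np.sqrt(gm.covariances_[1, 0, 0]))
plt.hist(X.ravel(), 100, density=True, alpha=.3, label='data')
plt.plot(x_plot, pdf1 + pdf2, lw=3, label='mixture')
plt.plot(x_plot, pdf1, ls='--', label='component 1')
plt.plot(x_plot, pdf2, ls='--', label='component 2')
plt.legend()
plt.show()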
As the use of pylab is now strongly discouraged by matplotlib because of namespace cluttering, I have rewritten the script by B.M. and added a plot of the two components:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
#data generation
np.random.seed(123)
data=np.concatenate((np.random.normal(1, .2, 5000), np.random.normal(1.6, .3, 2500)))
y,x,_=plt.hist(data, 100, alpha=.3, label='data')
x=(x[1:]+x[:-1])/2 # for len(x)==len(y)
#x, y inputs can be lists or 1D numpy arrays
def gauss(x, mu, sigma, A):
    return A*np.exp(-(x-mu)**2/2/sigma**2)

def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
    return gauss(x, mu1, sigma1, A1) + gauss(x, mu2, sigma2, A2)
expected = (1, .2, 250, 2, .2, 125)
params, cov = curve_fit(bimodal, x, y, expected)
sigma=np.sqrt(np.diag(cov))
x_fit = np.linspace(x.min(), x.max(), 500)
#plot combined...
plt.plot(x_fit, bimodal(x_fit, *params), color='red', lw=3, label='model')
#...and individual Gauss curves
plt.plot(x_fit, gauss(x_fit, *params[:3]), color='red', lw=1, ls="--", label='distribution 1')
plt.plot(x_fit, gauss(x_fit, *params[3:]), color='red', lw=1, ls=":", label='distribution 2')
#and the original data points if no histogram has been created before
#plt.scatter(x, y, marker="X", color="black", label="original data")
plt.legend()
print(pd.DataFrame(data={'params': params, 'sigma': sigma}, index=bimodal.__code__.co_varnames[1:]))
plt.show()
params sigma
mu1 1.014589 0.005273
sigma1 0.203826 0.004067
A1 230.654585 3.667409
mu2 1.635225 0.022423
sigma2 0.282690 0.019070
A2 72.621856 2.148459
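As a possible follow-up, the fitted curves can be used to split the sample into its two sub-populations by assigning each point to whichever component is larger at that point — a rough sketch reusing data, gauss and params from the script above:
# evaluate both fitted components at every data point
comp1 = gauss(data, *params[:3])
comp2 = gauss(data, *params[3:])
labels = comp1 >= comp2  # True where component 1 dominates
print(f'{labels.sum()} points assigned to component 1, {(~labels).sum()} to component 2')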