
Linear regression library for the Go language

I'm looking for a Go library that implements linear regression with MLE or LSE. Has anyone seen one?

There is this stats library, but it doesn't seem to have what I need: https://github.com/grd/statistics

Thanks!

asked May 07 '13 by user1094206

2 Answers

Implementing LSE (least-squares error) linear regression is fairly simple.

Here's an implementation in JavaScript; it should be trivial to port to Go.


Here's an (untested) port:

package main

import "fmt"

// Point is a single (x, y) sample.
type Point struct {
    X float64
    Y float64
}

// linearRegressionLSE fits y = m*x + b by least squares and returns
// the fitted points (the input X values with predicted Y values).
func linearRegressionLSE(series []Point) []Point {

    q := len(series)

    if q == 0 {
        return nil
    }

    n := float64(q)

    sumX, sumY, sumXX, sumXY := 0.0, 0.0, 0.0, 0.0

    for _, p := range series {
        sumX += p.X
        sumY += p.Y
        sumXX += p.X * p.X
        sumXY += p.X * p.Y
    }

    // Standard least-squares estimates of slope and intercept.
    m := (n*sumXY - sumX*sumY) / (n*sumXX - sumX*sumX)
    b := sumY/n - m*sumX/n

    r := make([]Point, q)

    for i, p := range series {
        r[i] = Point{p.X, p.X*m + b}
    }

    return r
}

func main() {
    // Points close to the line y = 2x + 1.
    series := []Point{{1, 3.1}, {2, 4.9}, {3, 7.1}, {4, 8.9}}
    fmt.Println(linearRegressionLSE(series))
}
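For reference, m and b above are the standard closed-form least-squares estimates: m = (n·Σxy − Σx·Σy) / (n·Σx² − (Σx)²) and b = ȳ − m·x̄, where n is the number of points.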
answered by thwd


I have implemented the following using gradient descent. It only returns the coefficients, but it takes any number of explanatory variables and is reasonably accurate:

package main

import "fmt"

// calc_ols_params fits a linear model by batch gradient descent and
// returns one coefficient per row of x.
func calc_ols_params(y []float64, x [][]float64, n_iterations int, alpha float64) []float64 {

    thetas := make([]float64, len(x))

    for i := 0; i < n_iterations; i++ {

        my_diffs := calc_diff(thetas, y, x)

        my_grad := calc_gradient(my_diffs, x)

        // diffs are y - prediction, so adding alpha*gradient moves
        // thetas downhill on the squared-error cost.
        for j := 0; j < len(my_grad); j++ {
            thetas[j] += alpha * my_grad[j]
        }
    }
    return thetas
}

// calc_diff returns the residual y[i] - prediction[i] for each observation.
func calc_diff(thetas []float64, y []float64, x [][]float64) []float64 {
    diffs := make([]float64, len(y))
    for i := 0; i < len(y); i++ {
        prediction := 0.0
        for j := 0; j < len(thetas); j++ {
            prediction += thetas[j] * x[j][i]
        }
        diffs[i] = y[i] - prediction
    }
    return diffs
}

// calc_gradient averages diffs[i] * x[j][i] over all observations for
// each coefficient j, giving the direction of steepest error decrease.
func calc_gradient(diffs []float64, x [][]float64) []float64 {
    gradient := make([]float64, len(x))
    for i := 0; i < len(diffs); i++ {
        for j := 0; j < len(x); j++ {
            gradient[j] += diffs[i] * x[j][i]
        }
    }
    for i := 0; i < len(x); i++ {
        gradient[i] = gradient[i] / float64(len(diffs))
    }

    return gradient
}

func main() {
    // The first row of x is all ones, so the first coefficient is the intercept.
    y := []float64{3, 4, 5, 6, 7}
    x := [][]float64{{1, 1, 1, 1, 1}, {4, 3, 2, 1, 3}}

    thetas := calc_ols_params(y, x, 100000, 0.001)

    fmt.Println("Thetas : ", thetas)

    y_2 := []float64{1, 2, 3, 4, 3, 4, 5, 4, 5, 5, 4, 5, 4, 5, 4, 5, 6, 5, 4, 5, 4, 3, 4}

    x_2 := [][]float64{
        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
        {4, 2, 3, 4, 5, 4, 5, 6, 7, 4, 8, 9, 8, 8, 6, 6, 5, 5, 5, 5, 5, 5, 5},
        {4, 1, 2, 3, 4, 5, 6, 7, 5, 8, 7, 8, 7, 8, 7, 8, 7, 7, 7, 7, 7, 6, 5},
        {4, 1, 2, 5, 6, 7, 8, 9, 7, 8, 7, 8, 7, 7, 7, 7, 7, 7, 6, 6, 4, 4, 4},
    }

    thetas_2 := calc_ols_params(y_2, x_2, 100000, 0.001)

    fmt.Println("Thetas_2 : ", thetas_2)
}
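To use the returned coefficients for prediction, here is a minimal sketch (the predict helper is a hypothetical addition, assuming a feature vector laid out like the training rows, including the leading 1.0 for the intercept):

func predict(thetas []float64, features []float64) float64 {
    // features[0] is expected to be 1.0 so that thetas[0] acts as the
    // intercept, matching the row of ones in the training data above.
    y := 0.0
    for j, t := range thetas {
        y += t * features[j]
    }
    return y
}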

Result:

Thetas :  [6.999959251448524 -0.769216974483968]
Thetas_2 :  [1.5694174539341945 -0.06169183063112409 0.2359981255871977 0.2424327101610395]

Go Playground

I checked my results against Python's pandas, and they were very close:

In [24]: from pandas.stats.api import ols

In [27]: x = [
     [4,2,3,4,5,4,5,6,7,4,8,9,8,8,6,6,5,5,5,5,5,5,5],
     [4,1,2,3,4,5,6,7,5,8,7,8,7,8,7,8,7,7,7,7,7,6,5],
     [4,1,2,5,6,7,8,9,7,8,7,8,7,7,7,7,7,7,6,6,4,4,4]
     ]

In [28]: y = [1,2,3,4,3,4,5,4,5,5,4,5,4,5,4,5,6,5,4,5,4,3,4]

In [29]: x.append(y)

In [30]: df = pd.DataFrame(np.array(x).T, columns=['x1','x2','x3','y'])

In [31]: ols(y=df['y'], x=df[['x1', 'x2', 'x3']])
Out[31]: 

-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x1> + <x2> + <x3> + <intercept>

Number of Observations:         23
Number of Degrees of Freedom:   4

R-squared:         0.5348
Adj R-squared:     0.4614

Rmse:              0.8254

F-stat (3, 19):     7.2813, p-value:     0.0019

Degrees of Freedom: model 3, resid 19

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
            x1    -0.0618     0.1446      -0.43     0.6741    -0.3453     0.2217
            x2     0.2360     0.1487       1.59     0.1290    -0.0554     0.5274
            x3     0.2424     0.1394       1.74     0.0983    -0.0309     0.5156
     intercept     1.5704     0.6331       2.48     0.0226     0.3296     2.8113
---------------------------------End of Summary---------------------------------
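Note the ordering: the Go result lists the intercept first (it pairs with the row of ones), so Thetas_2 of roughly [1.5694, -0.0617, 0.2360, 0.2424] lines up with pandas' intercept, x1, x2, and x3 to within about 0.001.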

and

In [34]: df_1 = pd.DataFrame(np.array([[3,4,5,6,7], [4,3,2,1,3]]).T, columns=['y', 'x'])

In [35]: df_1
Out[35]: 
   y  x
0  3  4
1  4  3
2  5  2
3  6  1
4  7  3

[5 rows x 2 columns]

In [36]: ols(y=df_1['y'], x=df_1['x'])
Out[36]: 

-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         5
Number of Degrees of Freedom:   2

R-squared:         0.3077
Adj R-squared:     0.0769

Rmse:              1.5191

F-stat (1, 3):     1.3333, p-value:     0.3318

Degrees of Freedom: model 1, resid 3

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x    -0.7692     0.6662      -1.15     0.3318    -2.0749     0.5365
     intercept     7.0000     1.8605       3.76     0.0328     3.3534    10.6466
---------------------------------End of Summary---------------------------------
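The simple case matches as well: Go's Thetas of roughly [7.0000, -0.7692] against pandas' intercept 7.0000 and slope -0.7692.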


answered by Akavall