Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How can I calculate a p-value for a hypergeometric distribution in Go?

In R, I can calculate a p-value for a hypergeometric distribution by using the phyper() function, of which the first value in the returned array is the p-value.

I was wondering whether there is any package in Go / Golang, that lets me do this calculation completely within Go?

like image 444
Samuel Lampa Avatar asked May 13 '14 15:05

Samuel Lampa


2 Answers

You should check out:

  • probab - Probability distribution functions. Bayesian inference. Written in pure Go.
  • stat - Pure Go implementation of the GSL Statistics library.
  • gostat - A statistics library for the go language
like image 172
Matt Self Avatar answered Sep 24 '22 16:09

Matt Self


When I find problems dealing with stats, my second line of attack after having found that a library does not exist is to port from the R code. This is mixed in ease since code may be R, C/C++ or fortran.

In this case it was pure C, so the port was trivial. Note that the Qhyper() implementation is not an exact port since I have used stirlerr() in place of lgammacor() for the lbeta() implementation. This doesn't seem to make a great deal of difference, but I advise caution if using this lbeta() (and so Qhyper()).

// Direct port of R code from nmath/{phyper,dbinom,stirlerr}.c and {dpq,nmath}.h.
// Code licensed under GPL for that reason (c) Dan Kortschak.
package main

import (
    "errors"
    "fmt"
    "math"
)

func main() {
    // Example values come from:
    // http://stackoverflow.com/questions/8382806/r-hypergeometric-test-phyper
    fmt.Println(Phyper(62, 1998, 5260-1998, 131, true, false))

    for x := 0.; x < 10; x++ {
        fmt.Println(Phyper(x, 10, 7, 8, true, false))
    }
    fmt.Println()
    for x := 0.; x < 10; x++ {
        fmt.Println(Dhyper(x, 10, 7, 8, false))
    }
    fmt.Println()
    for x := 0.; x < 10; x++ {
        fmt.Println(Qhyper(x, 10, 7, 8, true, false))
    }
}

var ErrDomain = errors.New("hyper: argument out of domain")

const (
    epsilon = 2.2204460492503131e-16
    min     = 2.2250738585072014e-308
)

// Sample of n balls from r red and b black ones; x are red
func Phyper(x, r, b, n float64, lowerTail, logP bool) (float64, error) {
    x = math.Floor(x + 1e-7)
    r = round(r)
    b = round(b)
    n = round(n)

    if r < 0 || b < 0 || notFinite(r+b) || n < 0 || n > r+b {
        return math.NaN(), ErrDomain
    }

    if x*(r+b) > n*r {
        b, r = r, b
        x = n - x - 1
        lowerTail = !lowerTail
    }

    if x < 0 {
        return dt0(lowerTail, logP), nil
    }
    if x >= r || x >= n {
        return dt1(lowerTail, logP), nil
    }

    d, err := Dhyper(x, r, b, n, logP)
    if err != nil {
        return d, err
    }
    pd := pdhyper(x, r, b, n, logP)

    if logP {
        return log(d+pd, lowerTail), nil
    }
    res := d * pd
    if lowerTail {
        return res, nil
    }
    // Use 0.5 - p + 0.5 to perhaps gain 1 bit of accuracy
    res = 0.5 - res
    return res + 0.5, nil
}

func Dhyper(x, r, b, n float64, giveLog bool) (float64, error) {
    if negativeOrNotInteger(r) || negativeOrNotInteger(b) || negativeOrNotInteger(n) || n > r+b {
        return math.NaN(), ErrDomain
    }
    if x < 0 {
        return 0, nil
    }
    if x != math.Floor(x) {
        return 0, fmt.Errorf("non-integer x = %f", x)
    }

    x = round(x)
    r = round(r)
    b = round(b)
    n = round(n)

    if n < x || r < x || n-x > b {
        return 0, nil
    }
    if n == 0 {
        if x == 0 {
            return 1, nil
        }
        return 0, nil
    }

    p := n / (r + b)
    q := (r + b - n) / (r + b)

    p1, err := dbinom(x, r, p, q, giveLog)
    if err != nil {
        return math.NaN(), err
    }
    p2, err := dbinom(n-x, b, p, q, giveLog)
    if err != nil {
        return math.NaN(), err
    }
    p3, err := dbinom(n, r+b, p, q, giveLog)
    if err != nil {
        return math.NaN(), err
    }

    if giveLog {
        return p1 + p2 - p3, nil
    }
    return p1 * p2 / p3, nil
}

func Qhyper(p, NR, NB, n float64, lowerTail, logP bool) (float64, error) {
    if notFinite(p) || notFinite(NR) || notFinite(NB) || notFinite(n) {
        return math.NaN(), ErrDomain
    }

    NR = round(NR)
    NB = round(NB)
    N := NR + NB
    n = round(n)
    if NR < 0 || NB < 0 || n < 0 || n > N {
        return math.NaN(), ErrDomain
    }

    /* Goal: Find xr (= #{red balls in sample}) such that
    * phyper(xr, NR,NB, n) >= p > phyper(xr - 1, NR,NB, n)
     */

    xstart := math.Max(0, n-NB)
    xend := math.Min(n, NR)

    if logP {
        if p > 0 {
            return math.NaN(), ErrDomain
        }
        if p == 0 { /* upper bound*/
            if lowerTail {
                return xend, nil
            }
            return xstart, nil
        }
        if math.IsInf(p, -1) {
            if lowerTail {
                return xstart, nil
            }
            return xend, nil
        }
    } else { /* !logP */
        if p < 0 || p > 1 {
            return math.NaN(), ErrDomain
        }
        if p == 0 {
            if lowerTail {
                return xstart, nil
            }
            return xend, nil
        }
        if p == 1 {
            if lowerTail {
                return xend, nil
            }
            return xstart, nil
        }
    }

    xr := xstart
    xb := n - xr /* always ( = #{black balls in sample} ) */

    smallN := N < 1000 /* won't have underflow in product below */
    /* if N is small, term := product.ratio( bin.coef );
    otherwise work with its logarithm to protect against underflow */
    t1, err := lfastchoose(NR, xr)
    if err != nil {
        return 0, err
    }
    t2, err := lfastchoose(NB, xb)
    if err != nil {
        return 0, err
    }
    t3, err := lfastchoose(N, n)
    if err != nil {
        return 0, err
    }
    term := t1 + t2 - t3
    if smallN {
        term = math.Exp(term)
    }
    NR -= xr
    NB -= xb

    if !lowerTail || logP {
        p = qIv(p, lowerTail, logP)
    }
    p *= 1 - 1000*epsilon /* was 64, but failed on FreeBSD sometimes */
    var sum float64
    if smallN {
        sum = term
    } else {
        sum = math.Exp(term)
    }

    for sum < p && xr < xend {
        xr++
        NB++
        if smallN {
            term *= (NR / xr) * (xb / NB)
        } else {
            term += math.Log((NR / xr) * (xb / NB))
        }
        if smallN {
            sum += term
        } else {
            sum += math.Exp(term)
        }
        xb--
        NR--
    }
    return xr, nil
}

func lfastchoose(n, k float64) (float64, error) {
    lb, err := lbeta(n-k+1, k+1)
    if err != nil {
        return math.NaN(), err
    }
    return -math.Log(n+1) - lb, nil
}

func lbeta(a, b float64) (float64, error) {
    p := a
    q := a
    if b < p {
        p = b
    } /* := min(a,b) */
    if b > q {
        q = b
    } /* := max(a,b) */

    /* both arguments must be >= 0 */
    if p < 0 {
        return math.NaN(), ErrDomain
    } else if p == 0 {
        return math.Inf(1), nil
    } else if notFinite(q) { /* q == +Inf */
        return math.Inf(1), nil
    }

    if p >= 10 {
        /* p and q are big. */
        corr := stirlerr(p) + stirlerr(q) - stirlerr(p+q)
        return math.Log(q)*-0.5 + logSqrt2Pi + corr + (p-0.5)*math.Log(p/(p+q)) + q*math.Log1p(-p/(p+q)), nil
    } else if q >= 10 {
        /* p is small, but q is big. */
        corr := stirlerr(q) - stirlerr(p+q)
        return math.Gamma(p) + corr + p - p*math.Log(p+q) + (q-0.5)*math.Log1p(-p/(p+q)), nil
    } else {
        /* p and q are small: p <= q < 10. */
        /* R change for very small args */
        if p < min {
            return lgamma(p) + (lgamma(q) - lgamma(p+q)), nil
        }
    }
    return math.Log(math.Gamma(p) * (math.Gamma(q) / math.Gamma(p+q))), nil
}

func lgamma(p float64) float64 {
    r, _ := math.Lgamma(p)
    return r
}

func qIv(p float64, lowerTail, logP bool) float64 {
    if logP {
        if lowerTail {
            return math.Exp(p)
        }
        return -math.Expm1(p)
    }
    if lowerTail {
        return p
    }
    p = 0.5 - p
    return p + 0.5
}

// Calculate
//
// phyper (x, r, b, n, TRUE, FALSE)
// [log] ----------------------------------
// dhyper (x, r, b, n, FALSE)
//
// without actually calling phyper. This assumes that
//
// x * (r + b) <= n * r
func pdhyper(x, r, b, n float64, logP bool) float64 {

    sum := 0.
    term := 1.

    for x > 0 && term >= epsilon*sum {
        term *= x * (b - n + x) / (n + 1 - x) / (r + 1 - x)
        sum += term
        x--
    }

    if logP {
        return math.Log1p(sum)
    }
    return 1 + sum
}

var (
    ln2   = math.Log(2)
    ln2Pi = math.Log(2 * math.Pi)
)

func log(x float64, lowerTail bool) float64 {
    if lowerTail {
        return math.Log(x)
    }
    if x > -ln2 {
        return math.Log(-math.Expm1(x))
    }
    return math.Log1p(-math.Exp(x))
}

func dbinom(x, n, p, q float64, giveLog bool) (float64, error) {
    if p == 0 {
        if x == 0 {
            return 1, nil
        }
        return 0, nil
    }
    if q == 0 {
        if x == n {
            return 1, nil
        }
        return 0, nil
    }

    if x == 0 {
        if n == 0 {
            return 1, nil
        }
        if p < 0.1 {
            t, err := bd0(n, n*q)
            if err != nil {
                return math.NaN(), err
            }
            return exp(-t-n*p, giveLog), nil
        }
        return exp(n*math.Log(q), giveLog), nil
    }
    if x == n {
        if q < 0.1 {
            t, err := bd0(n, n*p)
            if err != nil {
                return math.NaN(), err
            }
            return exp(-t-n*q, giveLog), nil
        }
        return exp(n*math.Log(p), giveLog), nil
    }
    if x < 0 || x > n {
        return 0, nil
    }

    // n*p or n*q can underflow to zero if n and p or q are small. This
    // used to occur in dbeta, and gives NaN as from R 2.3.0.
    t1, err := bd0(x, n*p)
    if err != nil {
        return math.NaN(), err
    }
    t2, err := bd0(n-x, n*q)
    if err != nil {
        return math.NaN(), err
    }
    lc := stirlerr(n) - stirlerr(x) - stirlerr(n-x) - t1 - t2

    // f = (M_2PI*x*(n-x))/n; could overflow or underflow
    // Upto R 2.7.1:
    // lf = log(M_2PI) + log(x) + log(n-x) - log(n);
    // -- following is much better for x << n :
    lf := ln2Pi + math.Log(x) + math.Log1p(-x/n)

    return exp(lc-0.5*lf, giveLog), nil
}

func negativeOrNotInteger(x float64) bool {
    return x < 0 || x != math.Floor(x)
}

func notFinite(x float64) bool {
    return math.IsNaN(x) || math.IsInf(x, 0)
}

func round(x float64) float64 {
    if _, frac := math.Modf(x); frac >= 0.5 {
        return math.Ceil(x)
    }
    return math.Floor(x)
}

func exp(x float64, giveLog bool) float64 {
    if giveLog {
        return x
    }
    return math.Exp(x)
}

func dt0(lowerTail, logP bool) float64 {
    if lowerTail {
        return d0(logP)
    }
    return d1(logP)
}

func dt1(lowerTail, logP bool) float64 {
    if lowerTail {
        return d1(logP)
    }
    return d0(logP)
}

func d0(logP bool) float64 {
    if logP {
        return math.Inf(-1)
    }
    return 0
}

func d1(logP bool) float64 {
    if logP {
        return 0
    }
    return 1
}

// bd0(x,M) :=  M * D0(x/M) = M*[ x/M * log(x/M) + 1 - (x/M) ] =
//       =  x * log(x/M) + M - x
// where M = E[X] = n*p (or = lambda), for   x, M > 0
//
// in a manner that should be stable (with small relative error)
// for all x and M=np. In particular for x/np close to 1, direct
// evaluation fails, and evaluation is based on the Taylor series
// of log((1+v)/(1-v)) with v = (x-M)/(x+M) = (x-np)/(x+np).
//
func bd0(x, np float64) (float64, error) {
    if notFinite(x) || notFinite(np) || np == 0 {
        return math.NaN(), ErrDomain
    }

    if math.Abs(x-np) < 0.1*(x+np) {
        v := (x - np) / (x + np) // might underflow to 0
        s := (x - np) * v        // s using v -- change by MM
        if math.Abs(s) < min {
            return s, nil
        }
        ej := 2 * x * v
        v = v * v
        for j := 1; j < 1000; j++ {
            // Taylor series; 1000: no infinite loop
            // as |v| < .1,  v^2000 is "zero"
            ej *= v // = v^(2j+1)
            s1 := s + ej/float64((j<<1)+1)
            if s1 == s { // last term was effectively 0
                return s1, nil
            }
            s = s1
        }
    }
    /* else:  | x - np |  is not too small */
    return x*math.Log(x/np) + np - x, nil
}

var (
    // error for 0, 0.5, 1.0, 1.5, ..., 14.5, 15.0.
    sfErrHalves = [31]float64{
        0.0, // n=0 - wrong, place holder only
        0.1534264097200273452913848,   // 0.5
        0.0810614667953272582196702,   // 1.0
        0.0548141210519176538961390,   // 1.5
        0.0413406959554092940938221,   // 2.0
        0.03316287351993628748511048,  // 2.5
        0.02767792568499833914878929,  // 3.0
        0.02374616365629749597132920,  // 3.5
        0.02079067210376509311152277,  // 4.0
        0.01848845053267318523077934,  // 4.5
        0.01664469118982119216319487,  // 5.0
        0.01513497322191737887351255,  // 5.5
        0.01387612882307074799874573,  // 6.0
        0.01281046524292022692424986,  // 6.5
        0.01189670994589177009505572,  // 7.0
        0.01110455975820691732662991,  // 7.5
        0.010411265261972096497478567, // 8.0
        0.009799416126158803298389475, // 8.5
        0.009255462182712732917728637, // 9.0
        0.008768700134139385462952823, // 9.5
        0.008330563433362871256469318, // 10.0
        0.007934114564314020547248100, // 10.5
        0.007573675487951840794972024, // 11.0
        0.007244554301320383179543912, // 11.5
        0.006942840107209529865664152, // 12.0
        0.006665247032707682442354394, // 12.5
        0.006408994188004207068439631, // 13.0
        0.006171712263039457647532867, // 13.5
        0.005951370112758847735624416, // 14.0
        0.005746216513010115682023589, // 14.5
        0.005554733551962801371038690, // 15.0
    }

    logSqrt2Pi = math.Log(math.Sqrt(2 * math.Pi))
)

// stirlerr(n) = log(n!) - log( sqrt(2*pi*n)*(n/e)^n )
//             = log Gamma(n+1) - 1/2 * [log(2*pi) + log(n)] - n*[log(n) - 1]
//             = log Gamma(n+1) - (n + 1/2) * log(n) + n - log(2*pi)/2
func stirlerr(n float64) float64 {
    const (
        S0 = 1. / 12.
        S1 = 1. / 360.
        S2 = 1. / 1260.
        S3 = 1. / 1680.
        S4 = 1. / 1188.
    )

    var nn float64

    if n <= 15.0 {
        nn = n + n
        if nn == math.Floor(nn) {
            return sfErrHalves[int(nn)]
        }
        lg, _ := math.Lgamma(n + 1)
        return lg - (n+0.5)*math.Log(n) + n - logSqrt2Pi
    }

    nn = n * n
    switch {
    case n > 500:
        return ((S0 - S1/nn) / n)
    case n > 80:
        return ((S0 - (S1-S2/nn)/nn) / n)
    case n > 35:
        return ((S0 - (S1-(S2-S3/nn)/nn)/nn) / n)
    default: // 15 < n <= 35
        return (S0 - (S1-(S2-(S3-S4/nn)/nn)/nn)/nn) / n
    }
}
like image 44
kortschak Avatar answered Sep 24 '22 16:09

kortschak