In R, I can calculate a p-value for a hypergeometric distribution by using the phyper() function, of which the first value in the returned array is the p-value.
I was wondering whether there is any package in Go / Golang, that lets me do this calculation completely within Go?
You should check out:
When I find problems dealing with stats, my second line of attack after having found that a library does not exist is to port from the R code. This is mixed in ease since code may be R, C/C++ or fortran.
In this case it was pure C, so the port was trivial. Note that the Qhyper()
implementation is not an exact port since I have used stirlerr()
in place of lgammacor()
for the lbeta()
implementation. This doesn't seem to make a great deal of difference, but I advise caution if using this lbeta()
(and so Qhyper()
).
// Direct port of R code from nmath/{phyper,dbinom,stirlerr}.c and {dpq,nmath}.h.
// Code licensed under GPL for that reason (c) Dan Kortschak.
package main
import (
"errors"
"fmt"
"math"
)
func main() {
// Example values come from:
// http://stackoverflow.com/questions/8382806/r-hypergeometric-test-phyper
fmt.Println(Phyper(62, 1998, 5260-1998, 131, true, false))
for x := 0.; x < 10; x++ {
fmt.Println(Phyper(x, 10, 7, 8, true, false))
}
fmt.Println()
for x := 0.; x < 10; x++ {
fmt.Println(Dhyper(x, 10, 7, 8, false))
}
fmt.Println()
for x := 0.; x < 10; x++ {
fmt.Println(Qhyper(x, 10, 7, 8, true, false))
}
}
var ErrDomain = errors.New("hyper: argument out of domain")
const (
epsilon = 2.2204460492503131e-16
min = 2.2250738585072014e-308
)
// Sample of n balls from r red and b black ones; x are red
func Phyper(x, r, b, n float64, lowerTail, logP bool) (float64, error) {
x = math.Floor(x + 1e-7)
r = round(r)
b = round(b)
n = round(n)
if r < 0 || b < 0 || notFinite(r+b) || n < 0 || n > r+b {
return math.NaN(), ErrDomain
}
if x*(r+b) > n*r {
b, r = r, b
x = n - x - 1
lowerTail = !lowerTail
}
if x < 0 {
return dt0(lowerTail, logP), nil
}
if x >= r || x >= n {
return dt1(lowerTail, logP), nil
}
d, err := Dhyper(x, r, b, n, logP)
if err != nil {
return d, err
}
pd := pdhyper(x, r, b, n, logP)
if logP {
return log(d+pd, lowerTail), nil
}
res := d * pd
if lowerTail {
return res, nil
}
// Use 0.5 - p + 0.5 to perhaps gain 1 bit of accuracy
res = 0.5 - res
return res + 0.5, nil
}
func Dhyper(x, r, b, n float64, giveLog bool) (float64, error) {
if negativeOrNotInteger(r) || negativeOrNotInteger(b) || negativeOrNotInteger(n) || n > r+b {
return math.NaN(), ErrDomain
}
if x < 0 {
return 0, nil
}
if x != math.Floor(x) {
return 0, fmt.Errorf("non-integer x = %f", x)
}
x = round(x)
r = round(r)
b = round(b)
n = round(n)
if n < x || r < x || n-x > b {
return 0, nil
}
if n == 0 {
if x == 0 {
return 1, nil
}
return 0, nil
}
p := n / (r + b)
q := (r + b - n) / (r + b)
p1, err := dbinom(x, r, p, q, giveLog)
if err != nil {
return math.NaN(), err
}
p2, err := dbinom(n-x, b, p, q, giveLog)
if err != nil {
return math.NaN(), err
}
p3, err := dbinom(n, r+b, p, q, giveLog)
if err != nil {
return math.NaN(), err
}
if giveLog {
return p1 + p2 - p3, nil
}
return p1 * p2 / p3, nil
}
func Qhyper(p, NR, NB, n float64, lowerTail, logP bool) (float64, error) {
if notFinite(p) || notFinite(NR) || notFinite(NB) || notFinite(n) {
return math.NaN(), ErrDomain
}
NR = round(NR)
NB = round(NB)
N := NR + NB
n = round(n)
if NR < 0 || NB < 0 || n < 0 || n > N {
return math.NaN(), ErrDomain
}
/* Goal: Find xr (= #{red balls in sample}) such that
* phyper(xr, NR,NB, n) >= p > phyper(xr - 1, NR,NB, n)
*/
xstart := math.Max(0, n-NB)
xend := math.Min(n, NR)
if logP {
if p > 0 {
return math.NaN(), ErrDomain
}
if p == 0 { /* upper bound*/
if lowerTail {
return xend, nil
}
return xstart, nil
}
if math.IsInf(p, -1) {
if lowerTail {
return xstart, nil
}
return xend, nil
}
} else { /* !logP */
if p < 0 || p > 1 {
return math.NaN(), ErrDomain
}
if p == 0 {
if lowerTail {
return xstart, nil
}
return xend, nil
}
if p == 1 {
if lowerTail {
return xend, nil
}
return xstart, nil
}
}
xr := xstart
xb := n - xr /* always ( = #{black balls in sample} ) */
smallN := N < 1000 /* won't have underflow in product below */
/* if N is small, term := product.ratio( bin.coef );
otherwise work with its logarithm to protect against underflow */
t1, err := lfastchoose(NR, xr)
if err != nil {
return 0, err
}
t2, err := lfastchoose(NB, xb)
if err != nil {
return 0, err
}
t3, err := lfastchoose(N, n)
if err != nil {
return 0, err
}
term := t1 + t2 - t3
if smallN {
term = math.Exp(term)
}
NR -= xr
NB -= xb
if !lowerTail || logP {
p = qIv(p, lowerTail, logP)
}
p *= 1 - 1000*epsilon /* was 64, but failed on FreeBSD sometimes */
var sum float64
if smallN {
sum = term
} else {
sum = math.Exp(term)
}
for sum < p && xr < xend {
xr++
NB++
if smallN {
term *= (NR / xr) * (xb / NB)
} else {
term += math.Log((NR / xr) * (xb / NB))
}
if smallN {
sum += term
} else {
sum += math.Exp(term)
}
xb--
NR--
}
return xr, nil
}
func lfastchoose(n, k float64) (float64, error) {
lb, err := lbeta(n-k+1, k+1)
if err != nil {
return math.NaN(), err
}
return -math.Log(n+1) - lb, nil
}
func lbeta(a, b float64) (float64, error) {
p := a
q := a
if b < p {
p = b
} /* := min(a,b) */
if b > q {
q = b
} /* := max(a,b) */
/* both arguments must be >= 0 */
if p < 0 {
return math.NaN(), ErrDomain
} else if p == 0 {
return math.Inf(1), nil
} else if notFinite(q) { /* q == +Inf */
return math.Inf(1), nil
}
if p >= 10 {
/* p and q are big. */
corr := stirlerr(p) + stirlerr(q) - stirlerr(p+q)
return math.Log(q)*-0.5 + logSqrt2Pi + corr + (p-0.5)*math.Log(p/(p+q)) + q*math.Log1p(-p/(p+q)), nil
} else if q >= 10 {
/* p is small, but q is big. */
corr := stirlerr(q) - stirlerr(p+q)
return math.Gamma(p) + corr + p - p*math.Log(p+q) + (q-0.5)*math.Log1p(-p/(p+q)), nil
} else {
/* p and q are small: p <= q < 10. */
/* R change for very small args */
if p < min {
return lgamma(p) + (lgamma(q) - lgamma(p+q)), nil
}
}
return math.Log(math.Gamma(p) * (math.Gamma(q) / math.Gamma(p+q))), nil
}
func lgamma(p float64) float64 {
r, _ := math.Lgamma(p)
return r
}
func qIv(p float64, lowerTail, logP bool) float64 {
if logP {
if lowerTail {
return math.Exp(p)
}
return -math.Expm1(p)
}
if lowerTail {
return p
}
p = 0.5 - p
return p + 0.5
}
// Calculate
//
// phyper (x, r, b, n, TRUE, FALSE)
// [log] ----------------------------------
// dhyper (x, r, b, n, FALSE)
//
// without actually calling phyper. This assumes that
//
// x * (r + b) <= n * r
func pdhyper(x, r, b, n float64, logP bool) float64 {
sum := 0.
term := 1.
for x > 0 && term >= epsilon*sum {
term *= x * (b - n + x) / (n + 1 - x) / (r + 1 - x)
sum += term
x--
}
if logP {
return math.Log1p(sum)
}
return 1 + sum
}
var (
ln2 = math.Log(2)
ln2Pi = math.Log(2 * math.Pi)
)
func log(x float64, lowerTail bool) float64 {
if lowerTail {
return math.Log(x)
}
if x > -ln2 {
return math.Log(-math.Expm1(x))
}
return math.Log1p(-math.Exp(x))
}
func dbinom(x, n, p, q float64, giveLog bool) (float64, error) {
if p == 0 {
if x == 0 {
return 1, nil
}
return 0, nil
}
if q == 0 {
if x == n {
return 1, nil
}
return 0, nil
}
if x == 0 {
if n == 0 {
return 1, nil
}
if p < 0.1 {
t, err := bd0(n, n*q)
if err != nil {
return math.NaN(), err
}
return exp(-t-n*p, giveLog), nil
}
return exp(n*math.Log(q), giveLog), nil
}
if x == n {
if q < 0.1 {
t, err := bd0(n, n*p)
if err != nil {
return math.NaN(), err
}
return exp(-t-n*q, giveLog), nil
}
return exp(n*math.Log(p), giveLog), nil
}
if x < 0 || x > n {
return 0, nil
}
// n*p or n*q can underflow to zero if n and p or q are small. This
// used to occur in dbeta, and gives NaN as from R 2.3.0.
t1, err := bd0(x, n*p)
if err != nil {
return math.NaN(), err
}
t2, err := bd0(n-x, n*q)
if err != nil {
return math.NaN(), err
}
lc := stirlerr(n) - stirlerr(x) - stirlerr(n-x) - t1 - t2
// f = (M_2PI*x*(n-x))/n; could overflow or underflow
// Upto R 2.7.1:
// lf = log(M_2PI) + log(x) + log(n-x) - log(n);
// -- following is much better for x << n :
lf := ln2Pi + math.Log(x) + math.Log1p(-x/n)
return exp(lc-0.5*lf, giveLog), nil
}
func negativeOrNotInteger(x float64) bool {
return x < 0 || x != math.Floor(x)
}
func notFinite(x float64) bool {
return math.IsNaN(x) || math.IsInf(x, 0)
}
func round(x float64) float64 {
if _, frac := math.Modf(x); frac >= 0.5 {
return math.Ceil(x)
}
return math.Floor(x)
}
func exp(x float64, giveLog bool) float64 {
if giveLog {
return x
}
return math.Exp(x)
}
func dt0(lowerTail, logP bool) float64 {
if lowerTail {
return d0(logP)
}
return d1(logP)
}
func dt1(lowerTail, logP bool) float64 {
if lowerTail {
return d1(logP)
}
return d0(logP)
}
func d0(logP bool) float64 {
if logP {
return math.Inf(-1)
}
return 0
}
func d1(logP bool) float64 {
if logP {
return 0
}
return 1
}
// bd0(x,M) := M * D0(x/M) = M*[ x/M * log(x/M) + 1 - (x/M) ] =
// = x * log(x/M) + M - x
// where M = E[X] = n*p (or = lambda), for x, M > 0
//
// in a manner that should be stable (with small relative error)
// for all x and M=np. In particular for x/np close to 1, direct
// evaluation fails, and evaluation is based on the Taylor series
// of log((1+v)/(1-v)) with v = (x-M)/(x+M) = (x-np)/(x+np).
//
func bd0(x, np float64) (float64, error) {
if notFinite(x) || notFinite(np) || np == 0 {
return math.NaN(), ErrDomain
}
if math.Abs(x-np) < 0.1*(x+np) {
v := (x - np) / (x + np) // might underflow to 0
s := (x - np) * v // s using v -- change by MM
if math.Abs(s) < min {
return s, nil
}
ej := 2 * x * v
v = v * v
for j := 1; j < 1000; j++ {
// Taylor series; 1000: no infinite loop
// as |v| < .1, v^2000 is "zero"
ej *= v // = v^(2j+1)
s1 := s + ej/float64((j<<1)+1)
if s1 == s { // last term was effectively 0
return s1, nil
}
s = s1
}
}
/* else: | x - np | is not too small */
return x*math.Log(x/np) + np - x, nil
}
var (
// error for 0, 0.5, 1.0, 1.5, ..., 14.5, 15.0.
sfErrHalves = [31]float64{
0.0, // n=0 - wrong, place holder only
0.1534264097200273452913848, // 0.5
0.0810614667953272582196702, // 1.0
0.0548141210519176538961390, // 1.5
0.0413406959554092940938221, // 2.0
0.03316287351993628748511048, // 2.5
0.02767792568499833914878929, // 3.0
0.02374616365629749597132920, // 3.5
0.02079067210376509311152277, // 4.0
0.01848845053267318523077934, // 4.5
0.01664469118982119216319487, // 5.0
0.01513497322191737887351255, // 5.5
0.01387612882307074799874573, // 6.0
0.01281046524292022692424986, // 6.5
0.01189670994589177009505572, // 7.0
0.01110455975820691732662991, // 7.5
0.010411265261972096497478567, // 8.0
0.009799416126158803298389475, // 8.5
0.009255462182712732917728637, // 9.0
0.008768700134139385462952823, // 9.5
0.008330563433362871256469318, // 10.0
0.007934114564314020547248100, // 10.5
0.007573675487951840794972024, // 11.0
0.007244554301320383179543912, // 11.5
0.006942840107209529865664152, // 12.0
0.006665247032707682442354394, // 12.5
0.006408994188004207068439631, // 13.0
0.006171712263039457647532867, // 13.5
0.005951370112758847735624416, // 14.0
0.005746216513010115682023589, // 14.5
0.005554733551962801371038690, // 15.0
}
logSqrt2Pi = math.Log(math.Sqrt(2 * math.Pi))
)
// stirlerr(n) = log(n!) - log( sqrt(2*pi*n)*(n/e)^n )
// = log Gamma(n+1) - 1/2 * [log(2*pi) + log(n)] - n*[log(n) - 1]
// = log Gamma(n+1) - (n + 1/2) * log(n) + n - log(2*pi)/2
func stirlerr(n float64) float64 {
const (
S0 = 1. / 12.
S1 = 1. / 360.
S2 = 1. / 1260.
S3 = 1. / 1680.
S4 = 1. / 1188.
)
var nn float64
if n <= 15.0 {
nn = n + n
if nn == math.Floor(nn) {
return sfErrHalves[int(nn)]
}
lg, _ := math.Lgamma(n + 1)
return lg - (n+0.5)*math.Log(n) + n - logSqrt2Pi
}
nn = n * n
switch {
case n > 500:
return ((S0 - S1/nn) / n)
case n > 80:
return ((S0 - (S1-S2/nn)/nn) / n)
case n > 35:
return ((S0 - (S1-(S2-S3/nn)/nn)/nn) / n)
default: // 15 < n <= 35
return (S0 - (S1-(S2-(S3-S4/nn)/nn)/nn)/nn) / n
}
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With