p-values#
Links: notebook
, html, PDF
, python
, slides, GitHub
Compute p-values and heavy tails estimators.
from jyquickhelper import add_notebook_menu
add_notebook_menu()
%matplotlib inline
p-value table#
from scipy.stats import norm
import pandas
from pandas import DataFrame
import numpy
def pvalue(p, q, N):
theta = abs(p-q)
var = p*(1-p)
bn = (2*N)**0.5 * theta / var**0.5
ret = (1 - norm.cdf(bn))*2
return ret
def pvalue_N(p, q, alpha):
theta = abs(p-q)
var = p*(1-p)
rev = abs(norm.ppf (alpha/2))
N = 2 * (rev * var**0.5 / theta)** 2
return int(N+1)
def alphatable(ps, dps, alpha):
values = []
for p in ps :
row=[]
for dp in dps :
q = p+dp
r = pvalue_N(p,q,alpha) if 1 >= q >= 0 else numpy.nan
row.append (r)
values.append (row)
return values
def dataframe(ps,dps,table):
columns = dps
df = pandas.DataFrame(data=table, index=ps)
df.columns = dps
return df
print ("norm.ppf(0.025)",norm.ppf (0.025)) # -1.9599639845400545
ps = [0.001, 0.002] + [ 0.05*i for i in range (1,20) ]
dps = [ -0.2, -0.1, -0.02, -0.01, -0.002, -0.001,
0.2, 0.1, 0.02, 0.01, 0.002, 0.001, ]
dps.sort()
t = alphatable(ps, dps, 0.05)
dataframe (ps, dps, t)
norm.ppf(0.025) -1.9599639845400545
-0.200 | -0.100 | -0.020 | -0.010 | -0.002 | -0.001 | 0.001 | 0.002 | 0.010 | 0.020 | 0.100 | 0.200 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0.001 | NaN | NaN | NaN | NaN | NaN | 7676 | 7676 | 1919 | 77 | 20 | 1.0 | 1.0 |
0.002 | NaN | NaN | NaN | NaN | 3834.0 | 15336 | 15336 | 3834 | 154 | 39 | 2.0 | 1.0 |
0.050 | NaN | NaN | 913.0 | 3650.0 | 91235.0 | 364939 | 364939 | 91235 | 3650 | 913 | 37.0 | 10.0 |
0.100 | NaN | 70.0 | 1729.0 | 6915.0 | 172866.0 | 691463 | 691463 | 172866 | 6915 | 1729 | 70.0 | 18.0 |
0.150 | NaN | 98.0 | 2449.0 | 9796.0 | 244893.0 | 979572 | 979572 | 244893 | 9796 | 2449 | 98.0 | 25.0 |
0.200 | 31.0 | 123.0 | 3074.0 | 12293.0 | 307317.0 | 1229267 | 1229267 | 307317 | 12293 | 3074 | 123.0 | 31.0 |
0.250 | 37.0 | 145.0 | 3602.0 | 14406.0 | 360137.0 | 1440548 | 1440548 | 360137 | 14406 | 3602 | 145.0 | 37.0 |
0.300 | 41.0 | 162.0 | 4034.0 | 16135.0 | 403354.0 | 1613413 | 1613413 | 403354 | 16135 | 4034 | 162.0 | 41.0 |
0.350 | 44.0 | 175.0 | 4370.0 | 17479.0 | 436966.0 | 1747864 | 1747864 | 436966 | 17479 | 4370 | 175.0 | 44.0 |
0.400 | 47.0 | 185.0 | 4610.0 | 18440.0 | 460976.0 | 1843901 | 1843901 | 460976 | 18440 | 4610 | 185.0 | 47.0 |
0.450 | 48.0 | 191.0 | 4754.0 | 19016.0 | 475381.0 | 1901523 | 1901523 | 475381 | 19016 | 4754 | 191.0 | 48.0 |
0.500 | 49.0 | 193.0 | 4802.0 | 19208.0 | 480183.0 | 1920730 | 1920730 | 480183 | 19208 | 4802 | 193.0 | 49.0 |
0.550 | 48.0 | 191.0 | 4754.0 | 19016.0 | 475381.0 | 1901523 | 1901523 | 475381 | 19016 | 4754 | 191.0 | 48.0 |
0.600 | 47.0 | 185.0 | 4610.0 | 18440.0 | 460976.0 | 1843901 | 1843901 | 460976 | 18440 | 4610 | 185.0 | 47.0 |
0.650 | 44.0 | 175.0 | 4370.0 | 17479.0 | 436966.0 | 1747864 | 1747864 | 436966 | 17479 | 4370 | 175.0 | 44.0 |
0.700 | 41.0 | 162.0 | 4034.0 | 16135.0 | 403354.0 | 1613413 | 1613413 | 403354 | 16135 | 4034 | 162.0 | 41.0 |
0.750 | 37.0 | 145.0 | 3602.0 | 14406.0 | 360137.0 | 1440548 | 1440548 | 360137 | 14406 | 3602 | 145.0 | 37.0 |
0.800 | 31.0 | 123.0 | 3074.0 | 12293.0 | 307317.0 | 1229267 | 1229267 | 307317 | 12293 | 3074 | 123.0 | 31.0 |
0.850 | 25.0 | 98.0 | 2449.0 | 9796.0 | 244893.0 | 979572 | 979572 | 244893 | 9796 | 2449 | 98.0 | NaN |
0.900 | 18.0 | 70.0 | 1729.0 | 6915.0 | 172866.0 | 691463 | 691463 | 172866 | 6915 | 1729 | 70.0 | NaN |
0.950 | 10.0 | 37.0 | 913.0 | 3650.0 | 91235.0 | 364939 | 364939 | 91235 | 3650 | 913 | NaN | NaN |
p-values in 2D#
import numpy, matplotlib, random, math
import matplotlib.pyplot as pylab
def matrix_square_root(sigma) :
eigen, vect = numpy.linalg.eig(sigma)
dim = len(sigma)
res = numpy.identity(dim)
for i in range(0,dim) :
res[i,i] = eigen[i]**0.5
return vect * res * vect.transpose()
def chi2_level (alpha = 0.95) :
N = 1000
x = [ random.gauss(0,1) for _ in range(0,N) ]
y = [ random.gauss(0,1) for _ in range(0,N) ]
r = map ( lambda c : (c[0]**2+c[1]**2)**0.5, zip(x,y))
r = list(r)
r.sort()
res = r [ int (alpha * N) ]
return res
def square_figure(mat, a) :
x = [ ]
y = [ ]
for i in range (0,100) :
x.append( a * mat[0][0]**0.5 )
y.append( (random.random ()-0.5) * a * mat[1][1]**0.5*2 )
x.append( -a * mat[0][0]**0.5 )
y.append( (random.random ()-0.5) * a * mat[1][1]**0.5*2 )
y.append( a * mat[1][1]**0.5 )
x.append( (random.random ()-0.5) * a * mat[0][0]**0.5*2 )
y.append( -a * mat[1][1]**0.5 )
x.append( (random.random ()-0.5) * a * mat[0][0]**0.5*2 )
pylab.plot(x,y, 'ro')
x = [ ]
y = [ ]
for i in range (0,100) :
x.append( a )
y.append( (random.random ()-0.5) * a*2 )
x.append( -a )
y.append( (random.random ()-0.5) * a*2 )
y.append( a )
x.append( (random.random ()-0.5) * a*2 )
y.append( -a )
x.append( (random.random ()-0.5) * a*2 )
xs,ys = [],[]
for a,b in zip (x,y) :
ar = numpy.matrix( [ [a], [b] ] ).transpose()
we = ar * root
xs.append( we [0,0] )
ys.append( we [0,1] )
pylab.plot(xs,ys, 'bo')
pylab.show()
def circle_figure (mat, a) :
x = [ ]
y = [ ]
for i in range (0,200) :
z = random.random() * math.pi * 2
i = a * mat[0][0]**0.5 * math.cos(z)
j = a * mat[0][0]**0.5 * math.sin(z)
x.append ( i )
y.append ( j )
pylab.plot(x,y, 'ro')
x = [ ]
y = [ ]
for i in range (0,200) :
z = random.random() * math.pi * 2
i = a * math.cos(z)
j = a * math.sin(z)
x.append ( i )
y.append ( j )
xs,ys = [],[]
for a,b in zip (x,y) :
ar = numpy.matrix( [ [a], [b] ] ).transpose()
we = ar * root
xs.append( we [0,0] )
ys.append( we [0,1] )
pylab.plot(xs,ys, 'bo')
pylab.show()
level = chi2_level ()
mat = [ [0.1, 0.05], [0.05, 0.2] ]
npmat = numpy.matrix(mat)
root = matrix_square_root (npmat)
square_figure (mat, 1.96)
circle_figure (mat, level)


p-value ratio#
import random, math
def densite_gauss (mu, sigma, x) :
e = -(x - mu)**2 / (sigma**2 * 2)
d = 1. / ((2*math.pi)**0.5 * sigma)
return d * math.exp (e)
def simulation_vector (N, mu, sigma) :
return [ random.gauss(mu,sigma) for n in range(N) ]
def ratio (vector, x, fdensite) :
under = 0
above = 0
fx = fdensite(x)
for u in vector:
f = fdensite (u)
if f >= fx:
above += 1
else:
under += 1
return float(above) / float (above + under)
x = 1.96
N = 10000
mu = 0
sigma = 1
v = simulation_vector(N, mu, sigma)
g = ratio(v, x, lambda y: densite_gauss (mu, sigma, y) )
print (g)
0.9487
p-values and EM#
See Applying the EM Algorithm: Binomial Mixtures.
from scipy.stats import norm
import random, math
def average_std_deviation(sample):
mean = 0.
var = 0.
for x in sample:
mean += x
var += x*x
mean /= len(sample)
var /= len(sample)
var -= mean*mean
return mean,var ** 0.5
def bootsample(sample):
n = len(sample)-1
return [ sample[ random.randint(0,n) ] for _ in sample ]
def bootstrap_difference(sampleX, sampleY, draws=2000, confidence=0.05):
diff = [ ]
for n in range (0,draws) :
if n % 1000 == 0:
print(n)
sx = bootsample(sampleX)
sy = bootsample(sampleY)
px = sum(sx) * 1.0/ len(sx)
py = sum(sy) * 1.0/ len(sy)
diff.append (px-py)
diff.sort()
n = int(len(diff) * confidence / 2)
av = sum(diff) / len(diff)
return av, diff [n], diff [len(diff)-n]
# generation of a sample
def generate_obs(p):
x = random.random()
if x <= p : return 1
else : return 0
def generate_n_obs(p, n):
return [ generate_obs(p) for i in range (0,n) ]
# std deviation
def diff_std_deviation(px, py):
s = px*(1-px) + py*(1-py)
return px, py, s**0.5
def pvalue(diff, std, N):
theta = abs(diff)
bn = (2*N)**0.5 * theta / std
pv = (1 - norm.cdf(bn))*2
return pv
def omega_i (X, pi, p, q) :
np = p * pi if X == 1 else (1-p)*pi
nq = q * (1-pi) if X == 1 else (1-q)*(1-pi)
return np / (np + nq)
def likelihood (X, pi, p, q) :
np = p * pi if X == 1 else (1-p)*pi
nq = q * (1-pi) if X == 1 else (1-q)*(1-pi)
return math.log(np) + math.log(nq)
def algoEM (sample):
p = random.random()
q = random.random()
pi = random.random()
iter = 0
while iter < 10 :
lk = sum ( [ likelihood (x, pi, p, q) for x in sample ] )
wi = [ omega_i (x, pi, p, q) for x in sample ]
sw = sum(wi)
pin = sum(wi) / len(wi)
pn = sum([ x * w for x,w in zip (sample,wi) ]) / sw
qn = sum([ x * (1-w) for x,w in zip (sample,wi) ]) / (len(wi) - sw)
pi,p,q = pin,pn,qn
iter += 1
lk = sum ( [ likelihood (x, pi, p, q) for x in sample ] )
return pi,p,q, lk
# mix
p,q = 0.20, 0.80
pi = 0.7
N = 1000
na = int(N * pi)
nb = N - na
print("------- sample")
sampleX = generate_n_obs(p, na) + generate_n_obs (q, nb)
random.shuffle(sampleX)
print("ave", p * pi + q*(1-pi))
print("mea", sum(sampleX)*1./len(sampleX))
lk = sum ( [ likelihood (x, pi, p, q) for x in sampleX ] )
print ("min lk", lk, sum (sampleX)*1. / len(sampleX))
res = []
for k in range (0, 10) :
r = algoEM (sampleX)
res.append ( (r[-1], r) )
res.sort ()
rows = []
for r in res:
pi,p,q,lk = r[1]
rows.append( [p * pi + q*(1-pi)] + list(r[1]))
df = pandas.DataFrame(data=rows)
df.columns = ["average", "pi", "p", "q", "likelihood"]
df
------- sample
ave 0.38
mea 0.373
min lk -3393.2292120130046 0.373
average | pi | p | q | likelihood | |
---|---|---|---|---|---|
0 | 0.373 | 0.000324 | 0.341877 | 0.373010 | -9358.705695 |
1 | 0.373 | 0.863747 | 0.284788 | 0.932204 | -4531.967709 |
2 | 0.373 | 0.936083 | 0.346101 | 0.766941 | -4490.512057 |
3 | 0.373 | 0.123023 | 0.290964 | 0.384508 | -3563.557269 |
4 | 0.373 | 0.538835 | 0.053584 | 0.746213 | -3487.438442 |
5 | 0.373 | 0.346351 | 0.057880 | 0.539974 | -3302.391944 |
6 | 0.373 | 0.797540 | 0.376491 | 0.359248 | -3144.938682 |
7 | 0.373 | 0.392520 | 0.592563 | 0.231131 | -2902.915478 |
8 | 0.373 | 0.390241 | 0.459488 | 0.317648 | -2778.903072 |
9 | 0.373 | 0.609127 | 0.338062 | 0.427447 | -2764.987703 |
p-value and heavy tail#
from scipy.stats import norm, zipf
import sys
def generate_n_obs_zipf (tail_index, n) :
return list(zipf.rvs(tail_index, size=n))
def hill_estimator (sample) :
sample = list(sample)
sample.sort(reverse=True)
end = len(sample)/10
end = min(end,100)
s = 0.
res = []
for k in range (0,end) :
s += math.log(sample[k])
h = (s - (k+1)*math.log(sample[k+1]))/(k+1)
h = 1./h
res.append( [k, h] )
return res
# mix
tail_index = 1.05
N = 10000
sample = generate_n_obs_zipf(tail_index, N)
sample[:5]
[357621, 148, 18, 1812876449, 36150]
import pandas
def graph_XY(curves, xlabel=None, ylabel=None, marker=True,
link_point=False, title=None, format_date="%Y-%m-%d",
legend_loc=0, figsize=None, ax=None):
if ax is None:
import matplotlib.pyplot as plt # pylint: disable=C0415
fig, ax = plt.subplots(1, 1, figsize=figsize)
smarker = {(True, True): 'o-', (True, False): 'o', (False, True): '-',
# (False, False) :''
}[marker, link_point]
has_date = False
for xf, yf, label in curves:
ax.plot(xf, yf, smarker, label=label)
ax.legend(loc=legend_loc)
return ax
def draw_variance(sample) :
avg = 0.
std = 0.
n = 0.
w = 1.
add = []
for i,x in enumerate(sample) :
x = float (x)
avg += x * w
std += x*x * w
n += w
val = (std/n - (avg/n)**2)**0.5
add.append ( [ i, avg/n, val] )
print(add[-1])
table = pandas.DataFrame(add, columns=["index", "avg(n)", "std(n)"])
return graph_XY([
[table['index'], table["avg(n)"], "avg(n)"],
[table['index'], table["std(n)"], "std(n)"],
], marker=False, link_point=True)
draw_variance(sample);
[9999, 55186871.0339, 233342554.46156308]

def draw_hill_estimator (sample) :
res = hill_estimator(sample)
table = DataFrame(res, columns=["d", "hill"])
return graph_XY(
[[table['d'], table['hill'], "Hill"],],
marker=False, link_point=True)
draw_hill_estimator(sample);

def draw_heavy_tail (sample) :
table = DataFrame([[_] for _ in sample ], columns=["obs"])
std = 1
normal = norm.rvs(size = len(sample))
normal = [ x*std for x in normal ]
nortbl = DataFrame([ [_] for _ in normal ], columns=["obs"])
nortbl["iobs"] = (nortbl['obs'] * 10).astype(numpy.int64)
histon = nortbl[["iobs", "obs"]].groupby('iobs', as_index=False).count()
histon.columns = ['iobs', 'nb']
histon = histon.sort_values("nb", ascending=False).reset_index(drop=True)
table["one"] = 1
histo = table.groupby('obs', as_index=False).count()
histo.columns = ['obs', 'nb']
histo = histo.sort_values('nb', ascending=False).reset_index(drop=True)
histo.reset_index(drop=True, inplace=True)
histo["index"] = histo.index + 1
vec = list(histon["nb"])
vec += [0,] * len(histo)
histo['nb_normal'] = vec[:len(histo)]
histo["log(index)"] = numpy.log(histo["index"]) / numpy.log(10)
histo["log(nb)"] = numpy.log(histo["nb"]) / numpy.log(10)
histo["log(nb_normal)"] = numpy.log(histo["nb_normal"]) / numpy.log(10)
return graph_XY ([
[histo["log(index)"], histo["log(nb)"], "Zipf"],
[histo["log(index)"], histo["log(nb_normal)"], "Gauss"], ],
marker=False, link_point=True)
draw_heavy_tail(sample);
c:python372_x64libsite-packagespandascoreseries.py:679: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
