The bootstrap algorithm proceeds as follows:
import numpy as np
import scipy as sp
from scipy import stats
# Sample data as double-precision floats (kept on one line for readability).
x = np.array([7, 19.8, 12.8, 6, 15.2, 5.1, 15, 7.6], dtype='d')
# 25% trimmed mean: drop the smallest and largest quarter before averaging.
print("Trimmed mean =", sp.stats.trim_mean(x, 0.25, axis=0))
Trimmed mean = 10.6
# 数据
# Data
x = np.array([7, 19.8, 12.8, 6, 15.2, 5.1, 15, 7.6], dtype='d')
B = 10000                # number of bootstrap replications
np.random.seed(2020)     # fix the RNG so the simulation is reproducible
Results = np.zeros([B])  # storage for the simulated statistics
# Bootstrap loop: resample with replacement and record the trimmed mean.
for rep in range(B):
    Bsample = np.random.choice(x, size=x.size, replace=True)
    # 25% trimmed mean of this resample
    Results[rep] = sp.stats.trim_mean(Bsample, 0.25, axis=0)
print(Results)
[ 6.25 7.15 8.1 ... 15.65 11.2 10.275]
Applying these formulae to our example:
# Bootstrap mean and (unbiased, ddof=1) variance of the replicated statistic.
mean = Results.mean()
var = Results.std(ddof=1) ** 2
print("Bootstrap sample mean=%4.3f; Variance=%4.3f"
      % (mean, var))
# Plug-in estimate and its bias-adjusted counterpart (2*theta_hat - boot mean).
thetahat = sp.stats.trim_mean(x, 0.25, axis=0)
print("Bootstrap estimate=%4.3f; Bias-adjusted estimate=%4.3f"
      % (thetahat, 2*thetahat - mean))
Bootstrap sample mean=10.651; Variance=6.431
Bootstrap estimate=10.600; Bias-adjusted estimate=10.549
We can use NumPy's partition() function to implement this:
lower = np.partition(Results, 250-1)[250-1]
upper = -np.partition(-Results, 250-1)[250-1]
print("95 percent bootstrap CI is [%4.2f,%4.2f]:"
% (lower, upper))
95 percent bootstrap CI is [6.40,15.55]:
The above code can be written in a more general way:
alpha = 10
# 1-based index of the alpha/2 empirical quantile (length-1 int array,
# truncated toward zero by the dtype='i' conversion).
num_obs = np.array([Results.size*(alpha*0.01/2)], dtype='i')
lower = np.partition(Results, num_obs-1)[num_obs-1]
upper = -np.partition(-Results, num_obs-1)[num_obs-1]
# BUG FIX: the original print call was missing its closing parenthesis.
print("%d percent bootstrap CI is [%4.2f,%4.2f]:"
      % (100-alpha, lower, upper))
90 percent bootstrap CI is [6.65,15.00]:
def bootstrap1(f, args, x, B=10000, seed=2020):
np.random.seed(seed)
Results = np.zeros([B])
for b in range(B):
Bsample = np.random.choice(x, size=x.size,
replace=True)
Results[b] = f(Bsample, *args)
return Results
def trim(Bsample, q):
    """Trimmed mean of *Bsample*, cutting fraction *q* from each tail."""
    return stats.trim_mean(Bsample, q, axis=0)
def themean(Bsample):
    """Plain (untrimmed) sample mean of *Bsample*."""
    return np.asarray(Bsample).mean()
def boot_est(f, args, x, Results):
    """Summarize a bootstrap run.

    Parameters
    ----------
    f : callable
        The statistic, evaluated on the original data as ``f(x, *args)``.
    args : tuple
        Extra positional arguments for ``f``.
    x : ndarray
        The original (non-resampled) data.
    Results : ndarray
        Bootstrap replications produced by ``bootstrap1``.

    Returns
    -------
    tuple
        (bootstrap mean, unbiased variance, bias-adjusted estimate).
    """
    center = Results.mean()
    deviations = Results - center
    # Unbiased sample variance of the replications (divide by n-1).
    spread = deviations.dot(deviations) / (Results.size - 1)
    plug_in = f(x, *args)
    # Bias-adjusted estimator: 2*theta_hat - bootstrap mean.
    return center, spread, 2 * plug_in - center
def boot_ci(Results, alpha=5):
    """Percentile bootstrap confidence interval at level (100 - alpha)%.

    Returns (lower, upper) as length-1 arrays holding the empirical
    alpha/2 and 1 - alpha/2 quantile endpoints of *Results*.
    """
    # Count of observations cut from each tail; kept as a length-1 int
    # array so the partition indexing mirrors the original behavior.
    cut = np.array([Results.size * (alpha * 0.01 / 2)], dtype='i')
    idx = cut - 1
    lower = np.partition(Results, idx)[idx]
    upper = -np.partition(-Results, idx)[idx]
    return lower, upper
quantile = 0.30
alpha_ci = 10
repetitions = 10000  # fixed typo: was "reptitions"
# Data
x = np.array([7, 19.8, 12.8, 6, 15.2, 5.1, 15, 7.6], dtype='d')
# Bootstrap sample
sample = bootstrap1(f=trim, args=(quantile,),
                    x=x, B=repetitions)
# Bootstrap mean, variance, and bias-adjusted estimate
estimates = boot_est(f=trim, args=(quantile,),
                     x=x, Results=sample)
# Bootstrap confidence interval
ci = boot_ci(sample, alpha=alpha_ci)
print("Bootstrap mean=%4.3f, Variance=%4.3f, Adjusted estimate=%4.3f" % estimates)
# BUG FIX: removed a stray trailing backslash that glued this statement
# to the following (non-code) line.
print("Bootstrap 90 percent CI = [%4.2f, %4.2f]" % ci)
Bootstrap mean=10.651, Variance=6.431, Adjusted estimate=10.549
Bootstrap 90 percent CI = [6.65, 15.00]
class MyBootstrap:
    """Bootstrap resampler.

    On construction, draws ``B`` resamples of ``x`` (with replacement)
    and stores ``f(resample, *args)`` for each in ``self.Results``.
    """

    def __init__(self, f, args, x, B=10000, seed=2020):
        np.random.seed(seed)  # reproducible resampling
        self.Results = np.zeros([B])
        for idx in range(B):
            # NOTE: self.Bsample retains only the most recent resample.
            self.Bsample = np.random.choice(x, size=x.size, replace=True)
            self.Results[idx] = f(self.Bsample, *args)

    def boot_est(self, f, args, x, Results):
        """Return (bootstrap mean, unbiased variance, bias-adjusted estimate).

        Also caches them on self as ``mean``, ``var``, ``thetahathat``.
        """
        self.mean = Results.mean()
        centered = Results - self.mean
        self.var = 1 / (Results.size - 1) * np.sum(centered ** 2)
        plug_in = f(x, *args)
        # Bias-adjusted estimator: 2*theta_hat - bootstrap mean.
        self.thetahathat = 2 * plug_in - self.mean
        return self.mean, self.var, self.thetahathat

    def boot_ci(self, Results, alpha=5):
        """Percentile CI endpoints at level (100 - alpha)%; length-1 arrays.

        Also caches them on self as ``low`` and ``up``.
        """
        cut = np.array([Results.size * (alpha * 0.01 / 2)], dtype='i')
        self.low = np.partition(Results, cut - 1)[cut - 1]
        self.up = -np.partition(-Results, cut - 1)[cut - 1]
        return self.low, self.up
def trim(Bsample, trim):
    """Trimmed mean of *Bsample*; *trim* is the fraction cut from each tail.

    NOTE(review): the parameter shadows the function name; left unchanged
    because callers may pass it by keyword, but consider renaming to ``q``
    to match the earlier definition.
    """
    return stats.trim_mean(Bsample, trim, axis=0)
def themean(Bsample):
    """Untrimmed sample mean (baseline statistic for the bootstrap)."""
    return np.asarray(Bsample).mean()
We first construct an empirical sample:
x = np.array([7,19.8,12.8,6,15.2,5.1,15,7.6], dtype='d')
quantile = 0.30
alpha_ci = 10
repetition = 10000
# 构建样本
boot = MyBootstrap(f=trim, args=(quantile,),
x=x, B=repetition)
boot.Results
array([ 6.25 , 7.15 , 8.1 , ..., 15.65 , 11.2 , 10.275])
Quantities of interest are saved in boot.boot_est and boot.boot_ci:
print("Bootstrap mean=%4.3f, Variance=%4.3f,
Adjusted estimate=%4.3f" % boot.boot_est(f=trim, args=(quantile,), x=x, Results=boot.Results))
print("Bootstrap 90 percent CI = [%4.2f, %4.2f]"
% boot.boot_ci(boot.Results, alpha=alpha_ci))
Bootstrap mean=10.651, Variance=6.431, Adjusted estimate=10.549
Bootstrap 90 percent CI = [6.65, 15.00]
import matplotlib.pyplot as plt
# 生成uniform分布的随机变量
u_rands = np.random.rand(10000)
fsamples = [-np.log(1-u) for u in u_rands]
f = lambda x: np.exp(-x)
plt.figure(figsize=(8,4))
plt.hist(fsamples, bins=30, alpha=0.3, density=True,
color='gray', label='Monte Carlo Sample')
plt.plot(np.linspace(0.001,15,1000),
f(np.linspace(0.001,15,1000)), lw=2,
color='k', label=r'Function of Interest: $f(x)$')
plt.title(r'Figure 9.1: Inversion Sampling', fontsize=14)
plt.xlabel(r'$x$', fontsize=12)
plt.ylabel(r'$y$', fontsize=12)
plt.legend(loc='best', fontsize=12)
plt.show()

where
Thus, one can consider the following algorithm to obtain the sample from
where
We first visualize the density of
from scipy.stats import norm
def f(x):
    """Target density: equal-weight mixture of N(30, 10^2) and N(80, 20^2)."""
    component1 = norm.pdf(x, loc=30, scale=10)
    component2 = norm.pdf(x, loc=80, scale=20)
    return 0.5 * component1 + 0.5 * component2
def g(x):
    """Proposal density: N(50, 30^2)."""
    return norm(loc=50, scale=30).pdf(x)
plt.figure(figsize=(8,4))
plt.plot(np.linspace(-70,200,1000),
f(np.linspace(-70,200,1000)), color='k',
linestyle=":", label=r"Target density: $f(x)$")
plt.plot(np.linspace(-70,200,1000),
g(np.linspace(-70,200,1000)), color='k',
label=r"Proposal density: $g(x)$")
plt.title(r'Figure 9.2: $f(x)$ and $g(x)$',
fontsize=14)
plt.xlabel(r'$x$', fontsize=12)
plt.ylabel(r'$y$', fontsize=12)
plt.legend(loc='best', fontsize=12)
plt.show()

In our example,
# 使用最大比率查找M
x = np.arange(-50,151)
M = max(f(x) / g(x))
plt.figure(figsize=(8,5))
plt.plot(np.linspace(-70,200,1000),
f(np.linspace(-70,200,1000)), color='k',
linestyle=":", label=r"Target density: $f(x)$")
plt.plot(np.linspace(-70,200,1000),
M * g(np.linspace(-70,200,1000)), color='k',
label=r"Scaled proposal density: $M\times g(x)$")
plt.title(r'Figure 9.3: $f(x)$ and Scaled $g(x)$',
fontsize=14)
plt.xlabel(r'$x$', fontsize=12)
plt.ylabel(r'$y$', fontsize=12)
plt.legend(loc='upper right', fontsize=12)
plt.show()

We now code up the algorithm.
def rejection_sampling(iter=1000):
    """Rejection sampler: propose from N(50, 30), accept when a uniform
    "height" drawn under the envelope M*g(x) falls below f(x).

    Relies on the module-level target ``f``, proposal ``g``, and envelope
    constant ``M``. Returns an array of accepted draws (length <= iter).
    """
    accepted = []
    for _ in range(iter):
        proposal = np.random.normal(50, 30)
        height = np.random.uniform(0, M * g(proposal))
        # Keep the proposal when the uniform height lands under f.
        if height <= f(proposal):
            accepted.append(proposal)
    return np.array(accepted)
fsamples = rejection_sampling(iter=10000)

Consider
provided that
What if
from scipy.stats import beta
# f函数绘图
x = np.linspace(0.001,12,1000)
f = lambda x: np.exp( -(x-1)**2/2./x) * (x+1)/12
plt.figure(figsize=(8,4))
plt.plot(x, f(x), label=r'$f(x)$',
color='k',linestyle=":",)
# g函数绘图
g = beta(2,3)
plt.plot(x, g.pdf(x), label=r'$g(x)$', color='k')
plt.title(r'Figure 9.5: $f(x)$ and $g(x)$',fontsize=14)
plt.xlabel(r'$x$', fontsize=12)
plt.ylabel(r'$y$', fontsize=12)
plt.legend(fontsize=12)
plt.show()

g = beta(2,3)
plt.figure(figsize=(8,5))
plt.plot(x, f(x*15), label=r'$f(15x)$', color='k')
plt.plot(x, g.pdf(x), label=r'$g(x)$',
color='k',linestyle=":")
plt.title(r'Figure 9.6: $f(15x)$ and $g(x)$',fontsize=14)
plt.xlabel(r'$x$', fontsize=12)
plt.ylabel(r'$y$', fontsize=12)
plt.legend(fontsize=12)
plt.show()

# 从g中取样
xi = g.rvs(10000)
# 权重
w = np.array([f(i*15)/g.pdf(i) for i in xi])
# 从新函数中取样
fsamples = np.random.choice(xi*15,50000, p=w/w.sum())

Unlike parametric methods,
The histogram can be considered the crudest nonparametric method that estimates the underlying probability distribution of the data.
def make_data(N, f=0.3, seed=1):
    """Bimodal sample: N standard-normal draws, with the trailing
    (1 - f) share shifted up by 5.

    Uses a dedicated RandomState so the output is reproducible for a
    given *seed*.
    """
    rng = np.random.RandomState(seed)
    sample = rng.randn(N)
    split = int(f * N)
    sample[split:] += 5
    return sample
x = make_data(30)

Suppose that
The histogram has a probability density estimator of the form
where
is the fraction of data points (
Commonly used kernel functions:
Let us use a standard normal curve at each data point instead of a block:
from scipy.stats import norm
x_d = np.linspace(-4, 8, 2000)
density = sum(norm(xi).pdf(x_d) for xi in x)
plt.figure(figsize=(8,4))
plt.fill_between(x_d, density, alpha=0.3, color='gray',
label="Smoothed-out distribution")
plt.plot(x, np.full_like(x, -0.1), '^k', markeredgewidth=1,
label="Data points")
plt.title("Figure 9.9: Smoothed-Out Density", fontsize=14)
plt.ylabel("Frequency", fontsize=12)
plt.xlabel(r"$x$", fontsize=12)
plt.legend(loc="upper left", fontsize=12)
plt.show()

SciPy's gaussian_kde method can perform kernel density estimation using the Gaussian kernel.
from scipy.stats import gaussian_kde
kde = gaussian_kde(x, bw_method=0.5)

The estimator follows the standard scikit-learn API: the fit method computes the parameters, and the score/predict-style methods evaluate the model. Its signature is sklearn.neighbors.KernelDensity(*, bandwidth=1.0, kernel='gaussian').
from sklearn.neighbors import KernelDensity
from scipy import stats
kde = KernelDensity(bandwidth=1.0,kernel='gaussian')
kde.fit(x[:, None])
score_samples() gives log of the probability density for a set of uniform data points (x_d).
logprob = kde.score_samples(x_d[:, None])
The next example demonstrates how the bandwidth affects the results:
def kde_sklearn(x, x_grid, h=1, k='gaussian'):
    """Kernel density estimate of data *x* evaluated on *x_grid*.

    Parameters
    ----------
    x : ndarray, shape (n,)
        Observed data points.
    x_grid : ndarray, shape (m,)
        Grid on which the estimated density is evaluated.
    h : float, optional
        Kernel bandwidth (default 1).
    k : str, optional
        Kernel name accepted by sklearn's KernelDensity (default 'gaussian').

    Returns
    -------
    ndarray, shape (m,)
        Estimated density values on ``x_grid``.
    """
    kde = KernelDensity(kernel=k, bandwidth=h)
    # Fit on the data (sklearn expects a 2-D array).
    kde.fit(x[:, None])
    # BUG FIX: evaluate on the x_grid argument; the original used the
    # global x_d and silently ignored x_grid.
    logprob = kde.score_samples(x_grid[:, None])
    return np.exp(logprob)
plt.figure(figsize=(8,5))
plt.plot(x_d, kde_sklearn(x, x_d, h=1, k='epanechnikov'),
label='Epanechnikov kernel',
color='k', linewidth=2, linestyle="-")
plt.plot(x_d, kde_sklearn(x, x_d, h=1, k='tophat'),
label='Tophat kernel',
color='k', linewidth=2, linestyle="-.")
plt.plot(x_d, kde_sklearn(x, x_d, h=1, k='gaussian'),
label='Gaussian kernel',
color='k', linewidth=2, linestyle=":")
plt.hist(x, 10, weights=np.ones(len(x)) / len(x),
fc='gray', histtype='stepfilled',
alpha=0.2)
plt.plot(x, np.full_like(x, -0.01), '^k',
markeredgewidth=1)
plt.ylabel("Density", fontsize=12)
plt.xlabel(r"$x$", fontsize=12)
plt.legend(loc='best', fontsize=12)
plt.title("Figure 9.12: Distribution with Different Kernel Functions",
fontsize=14)
plt.show()

In
We will use GridSearchCV to optimize the bandwidth for the preceding dataset.
from sklearn.model_selection import GridSearchCV
bandwidths = 10 ** np.linspace(-1, 1, 100)
grid = GridSearchCV(estimator=KernelDensity(kernel='gaussian'),
param_grid ={'bandwidth': bandwidths},
cv=20, verbose=True )
grid.fit(x[:, None]);
Fitting 20 folds for each of 100 candidates, totalling 2000 fits
grid.best_params_
{'bandwidth': 0.8497534359086445}
With GridSearchCV we can choose the kernel and the bandwidth together.
bandwidths = 10 ** np.linspace(-1, 1, 100)
param_grid =[ {'bandwidth': bandwidths ,
'kernel': ('gaussian', 'cosine','tophat' ) }]
grid = GridSearchCV(estimator=KernelDensity(),
param_grid=param_grid, cv=20, verbose=True)
grid.fit(x[:, None])
grid.best_params_
Fitting 20 folds for each of 300 candidates, totalling 6000 fits
{'bandwidth': 1.484968262254465, 'kernel': 'tophat'}
The Bayes' rule gives the following formula for calculating the posterior probability
where
Recall the coin-flipping problem.
For the choice of prior for
The beta distribution
The code below plots the prior densities for several beta distribution shapes, together with the binomial likelihood function with
import scipy as sp
import scipy.stats as st
plt.figure(figsize=(8, 5))
# X轴坐标
thetas = np.linspace(-0.25, 1.25, 200)
# Prior
prior = sp.stats.beta(6,6)
plt.plot(thetas, prior.pdf(thetas), label='Beta(6,6)',
c='k', linestyle="-.", alpha=0.8)
prior = sp.stats.beta(4,4)
plt.plot(thetas, prior.pdf(thetas), label='Beta(4,4)',
c='gray', linestyle="--")
prior = sp.stats.beta(10,10)
plt.plot(thetas, prior.pdf(thetas), label='Beta(10,10)',
c='gray', linestyle=":", alpha=0.5)
# Likelihood
n = 10
theta = 0.6
likelihood = sp.stats.binom(n=n, p=thetas)
plt.plot(thetas, n*likelihood.pmf(k=n*theta),
label='Likelihood', c='black', linestyle="-")
plt.title("Figure 9.14: Prior Densities and the Likelihood Function", fontsize=14)
plt.xlabel(r'$\theta$', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.legend(loc="upper left", fontsize=12)
plt.show()

We use SymPy to solve for the MAP estimate:
import sympy
from sympy import stats as stat
from sympy.abc import p,k,n
# 使用 sympy.expand_log 设置目标函数
obj=sympy.expand_log(sympy.log(p**k * (1-p)**(n-k) * stat.density(stat.Beta('p',6,6))(p)))
sol, = sympy.solve(sympy.simplify(sympy.diff(obj, p)), p)
sol
(k+5)/(k+10)
n = 10
theta = 0.6
rv = st.binom(n,theta)
mu = rv.mean()
# 定义先验分布
a, b = 6, 6
prior = st.beta(a, b)
# 定义后验分布
h = n*theta
posterior = st.beta(a+h, b+n-h)
# 计算MLE, MAP, HPD
theta_mle = mu/n
theta_map = (h+a-1)/(a+n+b-2)
hpdr = posterior.interval(0.95)
# 汇报结果
print("MLE=%4.3f; MAP=%4.3f; HPD is [%3.2f,%3.2f]"
% (theta_mle,theta_map,hpdr[0],hpdr[1]))
thetas = np.linspace(0, 1, 200)
plt.figure(figsize=(8, 5))
plt.plot(thetas, prior.pdf(thetas),
         label='Prior', c='gray', alpha=0.5)
plt.plot(thetas, posterior.pdf(thetas),
         label='Posterior', linestyle="--", c='gray')
plt.plot(thetas, n*st.binom(n, thetas).pmf(h),
         label='Likelihood', c='k')
# NOTE(review): axhline's xmin/xmax are axes-fraction coordinates; this
# only matches the HPD interval because xlim is [0, 1] — confirm if the
# axis limits ever change.
plt.axhline(0.3, hpdr[0], hpdr[1], c='black',
            linewidth=2, linestyle=":", label='95% HPD region')
plt.xlim([0, 1])
# BUG FIX: corrected the title typo "Likehood" -> "Likelihood".
plt.title("Figure 9.15: Prior, Likelihood, and Posterior Distributions", fontsize=14)
plt.xlabel(r'$\theta$', fontsize=12)
plt.ylabel('Density Function', fontsize=12)
plt.legend(loc="best", fontsize=12)
plt.show()

Consider our example.
The Metropolis-Hastings algorithm:
Given an initial guess for
Consider the coin-flipping experiment again:
from scipy import stats as st
n = 100
p = 0.6
h = n*p
rv = st.binom(n, p)
mu = rv.mean()
# Prior
a, b = 6, 6
prior = st.beta(a, b)
# Likelihood
lik = st.binom
# 选取sigama值 (建议分布)
sigma = 0.25
def target(lik, prior, n, h, theta):
    """Unnormalized posterior at *theta*: binomial likelihood pmf times prior pdf.

    Parameters
    ----------
    lik : scipy.stats distribution family (e.g. st.binom)
    prior : frozen scipy.stats distribution with a ``pdf`` method
    n : int — number of trials
    h : number of observed heads
    theta : float — candidate parameter value

    Returns 0 for theta outside the valid range [0, 1].
    """
    if not 0 <= theta <= 1:
        return 0
    likelihood = lik(n, theta).pmf(h)
    return likelihood * prior.pdf(theta)
# MCMC algorithm
def mh_coin(niters, n, h, theta, lik, prior, sigma, seed=100):
    """Metropolis-Hastings chain of length *niters* for the coin-flip posterior.

    Proposals are Gaussian random-walk steps with scale *sigma*; acceptance
    probabilities come from the module-level ``target`` (unnormalized
    posterior). Returns the chain as a list, starting at the initial guess.
    """
    np.random.seed(seed)
    chain = [theta]
    # The initial value already counts toward the chain length,
    # so draw niters - 1 further states.
    for _ in range(niters - 1):
        proposal = theta + st.norm(0, sigma).rvs()
        accept_prob = min(1, target(lik, prior, n, h, proposal)
                          / target(lik, prior, n, h, theta))
        if np.random.uniform() < accept_prob:
            theta = proposal
        chain.append(theta)
    return chain
theta = 0.1 # theta 的初始猜测值
niters = 2000 # MCMC 迭代次数
samples = mh_coin(niters, n, h, theta, lik, prior, sigma)
Generates the trace plot for the first 150 iterations
plt.figure(figsize=(11, 5))
plt.plot(samples, '-ok', alpha=1)
plt.xlim([0, 150])
plt.title("Figure 9.16: Trace Plot of MCMC Sample",
fontsize=14)
plt.ylabel(r'$\theta$', fontsize=12)
plt.xlabel('The number of MCMC iterations', fontsize=12)
plt.show()
Consider multiple chains with different starting values of
samples2 = [mh_coin(niters, n, h, theta, lik, prior, sigma)
for theta in np.arange(0.1, 1.1, 0.25)]
# 轨迹图
plt.figure(figsize=(11, 5))
alpha=0.1
for samples in samples2:
plt.plot(samples, '-', alpha=alpha, color='k')
alpha +=0.2
plt.xlim([0, 150])
plt.ylim([0, 1])
plt.title("Figure 9.17: Trace Plot using Multiple Chains",
fontsize=14)
plt.ylabel(r'$\theta$', fontsize=12)
plt.xlabel('The number of MCMC iterations', fontsize=12)
plt.show()

We compute the posterior quantities by:
# Discard the first tenth of the MCMC draws as burn-in
# (the code keeps samples[nmcmc:], i.e. the last 90%; the original
# comment claimed the opposite).
nmcmc = len(samples)//10
# Posterior mean & SD
print("Posterior mean=%4.3f \nPosterior SD=%4.3f"
% (np.mean(samples[nmcmc:]),np.std(samples[nmcmc:])) )
Posterior mean=0.589
Posterior SD=0.047
The general procedure:
With the Bayesian approach, the joint posterior distribution parameter
Consider the likelihood,
Assume
Assume a uniform prior over the real line for
If we transform the uniform prior on
The expression for the posterior reduces to:
We use a wage–education dataset (data_wageedu.csv) to model the return of years of education on wage. Gibbs sampling algorithm:
We run the Gibbs sampler for 10,000 iterations. We drop the first 1,000 draws as burn-in and use the remaining draws for computing the posterior mean as the estimate of each parameter.
import pandas as pd
from scipy.linalg import cholesky
# Load the data
data = pd.read_csv("dependency/data_wageedu.csv")
x = data.loc[:, "edu"].to_numpy(dtype='float', na_value=np.nan).reshape(-1, 1)
y = data.loc[:, "logwage"].to_numpy(dtype='float', na_value=np.nan).reshape(-1, 1)
X = np.append(np.ones((y.size, 1)), x, axis=1)
n, p = X.shape
M = 10000       # number of Gibbs iterations
burnin = 1000   # burn-in draws to discard
prng = np.random.RandomState(516)
betas, sigma2 = np.zeros([M, p]), np.ones(M)
# Use the OLS estimates as starting values
b_ols = np.linalg.inv(X.T@X)@X.T@y
V = np.linalg.inv(X.T @ X)
sigma2[0] = (y-X@b_ols).T @ (y-X@b_ols)/(n-p)
betas[0, :] = b_ols.T
# Gibbs sampling algorithm
for ii in range(1, M):
    # Draw beta from its conditional posterior
    betas[ii, :] = b_ols.T + prng.randn(p).reshape(1, -1) \
        @ cholesky(sigma2[ii - 1] * V)
    # Draw sigma2 from its conditional posterior
    resid = y - X @ betas[ii, :].reshape(-1, 1)
    sigma2[ii] = st.invgamma.rvs(
        a=0.5*n,
        scale=0.5*(resid.T @ resid).item(),
        random_state=prng)
# BUG FIX: report the OLS starting values BEFORE slicing off the burn-in.
# The original printed betas[0,:]/sigma2[0] after the slice, so the values
# labeled "OLS estimates" were actually the 1000th Gibbs draw.
print("-- OLS estimates --")
print("Beta est.", betas[0, :])
print("Variance est.", sigma2[0])
# Discard burn-in draws
betas = betas[burnin:, :]
sigma2 = sigma2[burnin:]
print("-- Bayesian estimates --")
print("Beta est.", betas.mean(axis=0))
print("Variance est.", sigma2.mean())
-- OLS estimates --
Beta est. [1.76422052 0.07150643]
Variance est. 0.018974787800205593
-- Bayesian estimates --
Beta est. [1.77606092 0.06987681]
Variance est. 0.01855152164249711
We use flat priors so that we can find the Bayesian point estimates are quite close to the OLS point estimates.