# Chapter 5: Linear Regression Models & Endogeneity Issues
# (Econometric Programming: Python as a Tool (《计量经济学编程:以Python语言为工具》), Yan Zizhong and Zhang Yi; China Financial and Economic Publishing House, 2024)
import pandas as pd
import numpy as np
# Read the data from data_incomeedu.dta under the dependency directory
data = pd.read_stata("dependency/data_incomeedu.dta")
x = data.loc[:,"educ"].to_numpy(dtype='float',
                                na_value=np.nan).reshape(-1,1)
y = data.loc[:,"income"].to_numpy(dtype='float',
                                  na_value=np.nan).reshape(-1,1)
y = np.log(y)
X = np.append(np.ones((y.size,1)), x, axis=1)
import time
# Method 1: element-wise computation with a for-loop
def forloopols(x, y):
    x_bar = x.mean()
    y_bar = y.mean()
    denominator = 0
    numerator = 0
    for i in range(y.size):
        denominator += (x[i] - x_bar)**2
        numerator += (x[i] - x_bar)*(y[i] - y_bar)
    return numerator/denominator

# Method 2: matrix computation
def matols(x, y):
    X = np.append(np.ones((y.size,1)), x, axis=1)
    return np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)), X.T), y)[1]
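# Aside (a minimal sketch, not from the book): in practice the inverse of
# X'X is rarely formed explicitly; np.linalg.lstsq solves the same
# least-squares problem via a more numerically stable decomposition.
beta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
print("lstsq slope estimate:", beta_lstsq[1])  # should match both methods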
startTime = time.time()  # start timing
for k in range(1000):
    slope_v1, = forloopols(x, y)
endTime = time.time() - startTime  # stop timing
startTime = time.time()
for k in range(1000):
    slope_v2, = matols(x, y)
faster_rate = endTime/(time.time() - startTime)
# Compare the running time of the loop and matrix methods
print("beta2 is estimated to be %5.3f by for-loop"
% (slope_v1))
print("beta2 is estimated to be %5.3f by matrix computation"
% (slope_v2))
print("Matrix computation is ", faster_rate," times faster.")
beta2 is estimated to be 0.039 by for-loop
beta2 is estimated to be 0.039 by matrix computation
Matrix computation is 28.338495326625466 times faster.
def timer_ols(method, rep_time=500000):
    startTime = time.time()
    if method == "dot":
        for ii in range(rep_time):
            np.dot(np.dot(np.linalg.inv(np.dot(X.T,X)), X.T), y)
        print("np.dot() takes %6.5f seconds"
              % (time.time()-startTime))
    elif method == "matmul":
        for ii in range(rep_time):
            np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),
                                X.T), y)
        print("np.matmul() takes %6.5f seconds"
              % (time.time()-startTime))
    elif method == "at":
        for ii in range(rep_time):
            (np.linalg.inv(X.T @ X) @ X.T) @ y
        print("@ takes %6.5f seconds"
              % (time.time()-startTime))
# Run the three timing variants
timer_ols("dot")
timer_ols("matmul")
timer_ols("at")
np.dot() takes 4.82326 seconds
np.matmul() takes 4.69728 seconds
@ takes 4.72106 seconds
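# Aside (a minimal sketch, not from the book): np.linalg.solve avoids the
# explicit inverse altogether and is the usual recommendation; timed the
# same way as the three variants above.
startTime = time.time()
for ii in range(500000):
    np.linalg.solve(X.T @ X, X.T @ y)
print("np.linalg.solve() takes %6.5f seconds" % (time.time() - startTime))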
import statsmodels.api as sm
# Add a constant column to the regressors
X = sm.add_constant(x)
# Specify the regression and fit it by OLS
result_ols = sm.OLS(y, X).fit()
print(result_ols.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     4.778
Date:                Sun, 21 May 2023   Prob (F-statistic):             0.0309
Time:                        21:30:58   Log-Likelihood:                -96.436
No. Observations:                 115   AIC:                             196.9
Df Residuals:                     113   BIC:                             202.4
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.9957      0.159     62.705      0.000       9.680      10.312
x1             0.0392      0.018      2.186      0.031       0.004       0.075
==============================================================================
Omnibus:                       94.673   Durbin-Watson:                   1.053
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1055.552
Skew:                           2.642   Prob(JB):                    6.16e-230
Kurtosis:                      16.870   Cond. No.                         27.2
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is
correctly specified.
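# Cross-check (not from the book): for a simple regression, scipy's
# linregress should reproduce the same slope and intercept, assuming x and
# y are the (NaN-free) arrays built above.
from scipy.stats import linregress
lr = linregress(x.reshape(-1), y.reshape(-1))
print("linregress: intercept %5.3f, slope %5.3f" % (lr.intercept, lr.slope))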
import matplotlib.pyplot as plt
beta = np.linalg.inv((X.T @ X))@ X.T @ y
plt.figure(figsize=(8,4))
plt.plot(x, y, 'ok', label='Original data',
markersize=8, alpha=0.4)
plt.plot(x, beta[0] + beta[1]*x, 'gray', label='Fitted line')
plt.title("Figure 5.1: The OLS Fitted Line and the Original Data",
fontsize=14)
plt.xlabel(r"$x$", fontsize=12)
plt.ylabel(r"$y$", fontsize=12)
plt.legend(fontsize=12)
plt.show()
N = y.size
# Estimate the parameters with the OLS formula
beta = np.dot(np.dot(np.linalg.inv(np.dot(X.T,X)), X.T), y)
# Compute the residuals
e = y - np.dot(X, beta)
# Compute the estimate of sigma squared
sigma2 = (1/(N-2)) * np.inner(e.T, e.T)
print("Estimate of sigma2:", sigma2[0])
# Compute the variance-covariance matrix of the OLS estimator
var = np.linalg.inv(np.dot(X.T,X))*sigma2
print("Variance-covariance matrix \n", var)
# Compute the standard errors of the OLS estimates
se = np.sqrt(np.diag(var))
print("SEs of beta0 is %6.3f and of beta1 is %6.3f"
      % (se[0], se[1]))
Estimate of sigma2: [0.31879951]
Variance-covariance matrix 
 [[ 0.02541155 -0.00269516]
 [-0.00269516  0.00032085]]
SEs of beta0 is  0.159 and of beta1 is  0.018
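# Cross-check (not from the book): the hand-computed SEs should match the
# nonrobust SEs stored on the statsmodels fit obtained earlier.
print("statsmodels SEs:", result_ols.bse)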
n = y.size
k = X.shape[1]
M0 = np.eye(n) - np.ones((n,n))/n
# Compute RSS and TSS
RSS = e.T@e
TSS = y.T@M0@y
# Compute R2 and adjusted R2
R2 = 1 - RSS/TSS
adjR2 = 1 - (RSS/(n-k))/(TSS/(n-1))
print("R-squared: %5.4f" % (R2))
print("Adj. R-squared: %5.4f" % (adjR2))
R-squared: 0.0406
Adj. R-squared: 0.0321
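# Cross-check (not from the book): statsmodels stores both quantities as
# attributes of the fitted model.
print("statsmodels R2: %5.4f, adj. R2: %5.4f"
      % (result_ols.rsquared, result_ols.rsquared_adj))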
print("t-stat of beta1: %4.3f"
% ((beta.reshape(-1)/se)[1]) )
t-stat of beta1: 2.186
from scipy.stats import t
cv = t.ppf(0.975, df=N-2)  # critical value
lower_bounds = beta.reshape(-1) - se*cv
upper_bounds = beta.reshape(-1) + se*cv
print("95%% CI of beta1: [%5.3f, %5.3f]" % (lower_bounds[1], upper_bounds[1]))
95% CI of beta1: [0.004, 0.075]
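# Cross-check (not from the book): result_ols.conf_int() returns the same
# 95% bounds for both coefficients.
print(result_ols.conf_int())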
from scipy.stats import norm
# Compute S_x and S_ex
Sx = (X.T@X)/n
Q = np.diag(e.reshape(-1))**2
Sex = (X.T@Q@X)/n
Avar = np.linalg.inv(Sx)@Sex@np.linalg.inv(Sx)/n
Ase = np.sqrt(np.diag(Avar))
zstat = beta.reshape(-1)/Ase
print("z-stat of beta1 is %4.3f" % (zstat[1]))
cv = norm.ppf(0.975)  # critical value
lower_bounds = beta.reshape(-1) - Ase*cv
upper_bounds = beta.reshape(-1) + Ase*cv
print("95%% asy. CI of beta1 is [%5.3f,%5.3f]"
      % (lower_bounds[1], upper_bounds[1]))
z-stat of beta1 is 2.223
95% asy. CI of beta1 is [0.005,0.074]
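# Cross-check (not from the book): these are White/HC0 standard errors, so
# statsmodels' cov_type='HC0' option should reproduce Ase.
print("HC0 SEs:", sm.OLS(y, X).fit(cov_type='HC0').bse)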
# Wald/F test of H0: beta1 = 0
R = [[0, 1]]
print(result_ols.f_test(R))
# Wald/F test of H0: beta0 = beta1
R = [[1, -1]]
print(result_ols.f_test(R))
from statsmodels.stats.diagnostic import het_breuschpagan, het_white
model = sm.OLS(y, X).fit()
bp_result = het_breuschpagan(model.resid, X)
white_result = het_white(model.resid, X)
print('BP test p-value: '+str(bp_result[3]))
print('White test p-value: '+str(white_result[3]))
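# Aside (not from the book): both functions return the tuple
# (LM stat, LM p-value, F stat, F p-value); the LM statistics themselves
# can be printed the same way.
print('BP LM statistic: '+str(bp_result[0]))
print('White LM statistic: '+str(white_result[0]))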
import seaborn as sns
sigma_e = np.sqrt(np.sum(e**2)/(N-2))  # estimated residual standard deviation
r = e/sigma_e  # standardized residuals
# Plot
reference = np.random.normal(size=200000)
plt.figure(figsize=(8,4))
sns.kdeplot(reference, color='black', lw=2,
            label=r"$N(0,1)$ PDF", linestyle="--")
sns.histplot(r, kde=True, linewidth=0, stat='density',
             color='k', alpha=0.6,
             label="Histogram of standardized residuals")
plt.title("Figure 5.2: Visualizing the Standardized Residuals",
          fontsize=14)
plt.ylabel("density", fontsize=12)
plt.xlabel(r"$r_i$", fontsize=12)
plt.legend(loc="best", fontsize=12)
plt.show()
from scipy.stats import normaltest
k2, p = normaltest(r.reshape(-1))  # flatten so the stats are scalars
print("K2 statistic: %5.4f \np-value: %5.4f" % (k2, p))
K2 statistic: 94.6726 
p-value: 0.0000
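# Cross-check (not from the book): the Jarque-Bera test reported in the
# OLS summary can be recomputed directly on the residuals.
from statsmodels.stats.stattools import jarque_bera
jb, jb_pv, skew, kurt = jarque_bera(e.reshape(-1))
print("JB statistic: %5.4f, p-value: %5.4f" % (jb, jb_pv))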
# Estimate Omega from the residual outer product, scaled by n/(n-k)
omega_hat = (e@e.T)*n/(n-k)
# Compute the robust variance-covariance matrix
XXinv = np.linalg.inv(X.T@X)
var = XXinv@X.T@np.diag(np.diag(omega_hat))@X@XXinv
print("White variance-covariance matrix \n", var)
# Compute the robust standard errors
se = np.sqrt(np.diag(var))
print("White SEs of beta0 is %6.5f and of beta1 is %6.5f" % (se[0],se[1]))
White variance-covariance matrix 
 [[ 0.02255978 -0.00250354]
 [-0.00250354  0.00031564]]
White SEs of beta0 is 0.15020 and of beta1 is 0.01777
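# Cross-check (not from the book): with the n/(n-k) scaling this is the
# HC1 estimator, so statsmodels' cov_type='HC1' should give the same SEs.
print("HC1 SEs:", sm.OLS(y, X).fit(cov_type='HC1').bse)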
# Configure the Python-Stata integration
import os
os.chdir('/Applications/Stata/utilities')
from pystata import config
config.init('mp')
# Transfer the data to Stata
from pystata import stata
stata.pdataframe_to_data(data, force=True)
# Regress with clustered standard errors
stata.run(
    '''gen lincome = ln(income)
    regress lincome educ, cluster(village_pid)
    ''')
Linear regression Number of obs = 115
F(1, 7) = 9.80
Prob > F = 0.0166
R-squared = 0.0406
Root MSE = .56462
(Std. err. adjusted for 8 clusters in village_pid)
---------------------------------------------------------------------
| Robust
lincome | Coefficient std. err. t P>|t| [95% conf. interval]
--------+------------------------------------------------------------
educ | .0391548 .0125084 3.13 0.017 .0095772 .0687324
_cons | 9.995728 .1576285 63.41 0.000 9.622995 10.36846
---------------------------------------------------------------------
# Extract the cluster variable from the data
cluster = data.loc[:,"village_pid"].to_numpy(dtype='float',
                                             na_value=np.nan).reshape(-1,1)
# Use np.unique to obtain the number of clusters G
G = len(np.unique(cluster))
# Compute the Omega matrix for the clustered variance
omega_hat = (e@e.T)*((G)/(G-1))*((n-1)/(n-k))
omega_hat_c = np.copy(omega_hat)
for i in range(n):
    for j in range(n):
        if i > j:
            if cluster[i] != cluster[j]:
                omega_hat_c[i,j] = 0; omega_hat_c[j,i] = 0
# Compute the clustered variance-covariance matrix
XXinv = np.linalg.inv(X.T@X)
var = XXinv@X.T@omega_hat_c@X@XXinv
print("Clustered variance-covariance matrix \n", var)
# Compute the clustered standard errors
se = np.sqrt(np.diag(var))
print("Clustered SEs of beta0 is %6.5f and of beta1 is %6.5f"
      % (se[0],se[1]))
Clustered variance-covariance matrix 
 [[ 0.02484674 -0.00062565]
 [-0.00062565  0.00015646]]
Clustered SEs of beta0 is 0.15763 and of beta1 is 0.01251
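# Cross-check (a sketch, not from the book): statsmodels can compute the
# same clustered SEs directly, assuming X, y and cluster are as built above.
result_cl = sm.OLS(y, X).fit(cov_type='cluster',
                             cov_kwds={'groups': cluster.reshape(-1)})
print("statsmodels clustered SEs:", result_cl.bse)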
import wooldridge
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
reduc_women = wooldridge.data('mroz')
# Drop observations with a missing wage
datas = reduc_women.dropna(subset=['lwage'])
x = datas[['exper','expersq','educ']]
y = datas.loc[:,'lwage']
X = sm.add_constant(x)
# OLS estimation (without addressing the endogeneity problem)
result_ols = sm.OLS(y, X).fit()
print(result_ols.summary2())
                 Results: Ordinary least squares
=================================================================
Model:              OLS              Adj. R-squared:     0.151   
Dependent Variable: lwage            AIC:                871.1979
Date:               2024-03-25 12:02 BIC:                887.4344
No. Observations:   428              Log-Likelihood:     -431.60 
Df Model:           3                F-statistic:        26.29   
Df Residuals:       424              Prob (F-statistic): 1.30e-15
R-squared:          0.157            Scale:              0.44412 
-----------------------------------------------------------------
           Coef.   Std.Err.     t     P>|t|    [0.025    0.975]
-----------------------------------------------------------------
const     -0.5220    0.1986  -2.6282  0.0089   -0.9125   -0.1316
exper      0.0416    0.0132   3.1549  0.0017    0.0157    0.0675
expersq   -0.0008    0.0004  -2.0628  0.0397   -0.0016   -0.0000
educ       0.1075    0.0141   7.5983  0.0000    0.0797    0.1353
-----------------------------------------------------------------
Omnibus:              77.792       Durbin-Watson:          1.961  
Prob(Omnibus):        0.000        Jarque-Bera (JB):       300.917
Skew:                 -0.753       Prob(JB):               0.000  
Kurtosis:             6.822        Condition No.:          2212   
=================================================================
* The condition number is large (2e+03). This might indicate
strong multicollinearity or other numerical problems.
y = datas[['lwage']].to_numpy()
X = sm.add_constant(datas[['exper','expersq','educ']]).to_numpy()
Z = sm.add_constant(datas[['exper','expersq',
                           'fatheduc']]).to_numpy()
# IV point estimates
beta_IV = np.linalg.inv(Z.T@X)@Z.T@y
# Variance estimation
e = y - X@beta_IV
# Compute the estimate of sigma squared
sigma2 = (1/(y.size)) * np.inner(e.T, e.T)
se_IV = np.sqrt(np.diag(np.linalg.inv(Z.T@X)@Z.T
                        @Z@np.linalg.inv(X.T@Z)*sigma2))
print(" Parameter Std. Err.")
print("Intercept %8.4f, %8.4f"
% (beta_IV[0], se_IV[0]) )
print("exper %8.4f, %8.4f"
% (beta_IV[1], se_IV[1]) )
print("expersq %8.4f, %8.4f"
% (beta_IV[2], se_IV[2]) )
print("educ %8.4f, %8.4f"
% (beta_IV[3], se_IV[3]) )
 Parameter Std. Err.
Intercept  -0.0611,   0.4344
exper    0.0437,   0.0133
expersq  -0.0009,   0.0004
educ     0.0702,   0.0343
from linearmodels.iv import IV2SLS
reg_2sls=IV2SLS.from_formula(
formula='lwage~1+exper+expersq+[educ~fatheduc]',
data= datas)
print(reg_2sls.fit(cov_type='unadjusted'))
IV-2SLS Estimation Summary
===================================================================
Dep. Variable: lwage R-squared: 0.1430
Estimator: IV-2SLS Adj. R-squared: 0.1370
No. Observations: 428 F-statistic: 25.176
Date: Wed, Nov 15 2023 P-value (F-stat) 0.0000
Time: 17:57:30 Distribution: chi2(3)
Cov. Estimator: unadjusted
Parameter Estimates
===================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
-------------------------------------------------------------------
Intercept -0.0611 0.4344 -0.1407 0.8881 -0.9125 0.7903
exper 0.0437 0.0133 3.2744 0.0011 0.0175 0.0698
expersq -0.0009 0.0004 -2.2107 0.0271 -0.0017 -0.0001
educ 0.0702 0.0343 2.0485 0.0405 0.0030 0.1374
===================================================================
Endogenous: educ
Instruments: fatheduc
Unadjusted Covariance (Homoskedastic)
Debiased: False
Z = sm.add_constant(datas[['exper','expersq','fatheduc',
                           'motheduc']]).to_numpy()
# 2SLS point estimates
Pz = Z@np.linalg.inv(Z.T@Z)@Z.T
Xhat = Pz@X
beta_2SLS = np.linalg.inv(Xhat.T@X)@Xhat.T@y
# Variance estimation (heteroskedasticity-robust)
e = y - X@beta_2SLS
diagomega = np.diag(np.diag(e@e.T))
se_2SLS = np.sqrt(np.diag(np.linalg.inv(Xhat.T@Xhat)
                          @(Xhat.T@diagomega@Xhat)
                          @np.linalg.inv(Xhat.T@Xhat)))
# Report the results
print(" Parameter Robust Std. Err.")
print("Intercept %8.4f, %8.4f"
% (beta_2SLS[0], se_2SLS[0]) )
print("exper %8.4f, %8.4f"
% (beta_2SLS[1], se_2SLS[1]) )
print("expersq %8.4f, %8.4f"
% (beta_2SLS[2], se_2SLS[2]) )
print("educ %8.4f, %8.4f"
% (beta_2SLS[3], se_2SLS[3]) )
 Parameter Robust Std. Err.
Intercept    0.0481,   0.4278
exper    0.0442,   0.0155
expersq  -0.0009,   0.0004
educ     0.0614,   0.0332
from linearmodels.iv import IV2SLS
reg_2sls=IV2SLS.from_formula(
formula='lwage~1+exper+expersq+[educ~motheduc+fatheduc]',
data= datas)
print(reg_2sls.fit())
IV-2SLS Estimation Summary
===================================================================
Dep. Variable: lwage R-squared: 0.1357
Estimator: IV-2SLS Adj. R-squared: 0.1296
No. Observations: 428 F-statistic: 18.611
Date: Wed, Nov 15 2023 P-value (F-stat) 0.0003
Time: 17:58:25 Distribution: chi2(3)
Cov. Estimator: robust
Parameter Estimates
===================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
-------------------------------------------------------------------
Intercept 0.0481 0.4278 0.1124 0.9105 -0.7903 0.8865
exper 0.0442 0.0155 2.8546 0.0043 0.0138 0.0745
expersq -0.0009 0.0004 -2.1001 0.0357 -0.0017 -0.0001
educ 0.0614 0.0332 1.8503 0.0643 -0.0036 0.1264
===================================================================
Endogenous: educ
Instruments: fatheduc, motheduc
Unadjusted Covariance (Heteroskedastic)
Debiased: False
# First-stage regression
reg_1stage=smf.ols(formula='educ~exper+expersq+fatheduc',
data=datas)
results_1stage=reg_1stage.fit()
print(results_1stage.summary2())
                 Results: Ordinary least squares
==================================================================
Model:              OLS              Adj. R-squared:     0.170    
Dependent Variable: educ             AIC:                1846.5075
Date:               2024-03-25 12:02 BIC:                1862.7440
No. Observations:   428              Log-Likelihood:     -919.25  
Df Model:           3                F-statistic:        30.09    
Df Residuals:       424              Prob (F-statistic): 1.18e-17 
R-squared:          0.176            Scale:              4.3366   
------------------------------------------------------------------
            Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
------------------------------------------------------------------
Intercept   9.8870    0.3956  24.9920  0.0000   9.1094  10.6646
exper       0.0468    0.0411   1.1391  0.2553  -0.0340   0.1276
expersq    -0.0012    0.0012  -0.9364  0.3496  -0.0036   0.0013
fatheduc    0.2705    0.0289   9.3670  0.0000   0.2137   0.3273
------------------------------------------------------------------
Omnibus:              10.235       Durbin-Watson:          1.919 
Prob(Omnibus):        0.006        Jarque-Bera (JB):       17.236
Skew:                 -0.103       Prob(JB):               0.000 
Kurtosis:             3.961        Condition No.:          1411  
==================================================================
* The condition number is large (1e+03). This might indicate
strong multicollinearity or other numerical problems.
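# Aside (not from the book): instrument strength can be gauged with the
# first-stage F test on the excluded instrument.
print(results_1stage.f_test('fatheduc = 0'))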
from scipy.stats import chi2
# 1. Run the 2SLS regression and obtain the residuals
reg_2sls = IV2SLS.from_formula(
    formula='lwage~1+exper+expersq+[educ~fatheduc+motheduc]',
    data=datas)
results_2sls = reg_2sls.fit(cov_type='unadjusted', debiased=True)
# 2. Store the 2SLS residuals so they can be regressed on the instruments
#    and exogenous variables
datas['resid_2sls'] = results_2sls.resids
# 3.1 Build the auxiliary regression for the overidentification (OR) test
reg_or = smf.ols(formula='resid_2sls~1+exper+expersq+motheduc+fatheduc',
                 data=datas)
results_or = reg_or.fit()
# 3.2 Compute the test statistic and draw the conclusion
R2 = results_or.rsquared  # R2 of the auxiliary regression
n = results_or.nobs
q = 2 - 1  # overidentifying restrictions: instruments minus endogenous regressors
teststat = n*R2
pval = 1 - chi2.cdf(teststat, q)
print(f'Overidentification test: {teststat}')
print(f'p-value: {pval}')
print(f'sample size: {n}')
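# Cross-check (not from the book): linearmodels reports the equivalent
# Sargan overidentification statistic on the fitted 2SLS results.
print(results_2sls.sargan)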
# Hausman-type test for endogeneity: first regress educ on the exogenous
# regressors and both instruments
reg_1stage = smf.ols(formula='educ~exper+expersq+motheduc+fatheduc',
                     data=datas)
results_1stage = reg_1stage.fit()
datas['resid'] = results_1stage.resid
# Then add the first-stage residuals to the structural equation; a
# significant coefficient on resid signals that educ is endogenous
reg_endo = smf.ols(formula='lwage~resid+educ+exper+expersq', data=datas)
results_endo = reg_endo.fit()
print(results_endo.summary2())
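# Cross-check (not from the book): linearmodels provides the analogous
# Wu-Hausman endogeneity test directly on the fitted 2SLS results.
print(results_2sls.wu_hausman())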