In [1]:
# 第五章 线性回归模型及内生性问题 --- Linear Regression Models & Endogeneity Issues
#(《计量经济学编程:以Python语言为工具》严子中、张毅著;中国财经出版社,2024)

Introduction¶

OLS Estimation¶

Estimation of $\beta$¶

In [2]:
import pandas as pd
import numpy as np
# 对应目录下从data_wageedu.dta中读取数据
data = pd.pandas.read_stata("dependency/data_incomeedu.dta")
x = data.loc[:,"educ"].to_numpy(dtype='float',
                                na_value=np.nan).reshape(-1,1)
y = data.loc[:,"income"].to_numpy(dtype='float',
                                na_value=np.nan).reshape(-1,1)
y = np.log(y)
X = np.append(np.ones((y.size,1)), x, axis=1)
In [3]:
import time
# 方法一:循环计算
def forloopols(x, y): 
    x_bar = x.mean() 
    y_bar = y.mean() 
    denominator = 0
    numerator = 0
    for i in range(y.size): 
        denominator += (x[i]-x_bar)**2
        numerator += (x[i]-x_bar)*(y[i]-y_bar)
    return numerator/denominator
# 方法二:矩阵计算
def matols(x, y): 
    X = np.append(np.ones((y.size,1)),x,axis=1)
    return np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)), X.T), y)[1]

startTime = time.time() # 计时开始
for k in range(1000):
    slope_v1, = forloopols(x, y)
endTime = time.time()-startTime # 计时结束
startTime = time.time()
for k in range(1000):
    slope_v2, = matols(x, y)
faster_rate = endTime/(time.time()-startTime) 

# 对比循环与矩阵方法的用时差别
print("beta2 is estimated to be %5.3f by for-loop" 
      % (slope_v1))
print("beta2 is estimated to be %5.3f by matrix computation" 
      % (slope_v2))
print("Matrix computation is ", faster_rate," times faster.")
beta2 is estimated to be 0.039 by for-loop
beta2 is estimated to be 0.039 by matrix computation
Matrix computation is  28.338495326625466  times faster.
In [5]:
def timer_ols(method, rep_time=500000):
    startTime = time.time()
    if method == "dot":    
        for ii in range(rep_time):
            np.dot(np.dot(np.linalg.inv(np.dot(X.T,X)), X.T), y)
        print("np.dot() takes %6.5f seconds" 
              % (time.time()-startTime) )
    elif method == "matmul":
        for ii in range(rep_time):
            np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),
                                X.T), y)
        print("np.matmul() takes %6.5f seconds" 
              % (time.time()-startTime))
    elif method == "at": 
        for ii in range(rep_time):
            (np.linalg.inv(X.T @ X) @ X.T) @ y
        print("@ takes %6.5f seconds" 
              % (time.time()-startTime))
# 执行
timer_ols("dot")
timer_ols("matmul")
timer_ols("at")
np.dot() takes 4.82326 seconds
np.matmul() takes 4.69728 seconds
@ takes 4.72106 seconds
In [ ]:
import statsmodels.api as sm
# 读取数据并且添加常数项列
X = sm.add_constant(x)
# 采用回归方程进行数据输入并进行拟合以及估计
result_ols = sm.OLS(y, X).fit()

print(result_ols.summary2())
OLS Regression Results                            
=========================================================================
Dep. Variable:                      y   R-squared:                  0.041
Model:                            OLS   Adj. R-squared:             0.032
Method:                 Least Squares   F-statistic:                4.778
Date:                Sun, 21 May 2023   Prob (F-statistic):        0.0309
Time:                        21:30:58   Log-Likelihood:           -96.436
No. Observations:                 115   AIC:                        196.9
Df Residuals:                     113   BIC:                        202.4
Df Model:                           1                                    
Covariance Type:            nonrobust                                    
=========================================================================
                 coef    std err          t      P>|t|     [0.025  0.975]
-------------------------------------------------------------------------
const          9.9957      0.159     62.705      0.000      9.680  10.312
x1             0.0392      0.018      2.186      0.031      0.004   0.075
=========================================================================
Omnibus:                       94.673   Durbin-Watson:              1.053
Prob(Omnibus):                  0.000   Jarque-Bera (JB):        1055.552
Skew:                           2.642   Prob(JB):               6.16e-230
Kurtosis:                      16.870   Cond. No.                    27.2
=========================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is 
correctly specified.

Plot the Fitted Line¶

In [ ]:
import matplotlib.pyplot as plt
beta = np.linalg.inv((X.T @ X))@ X.T @ y
plt.figure(figsize=(8,4))
plt.plot(x, y, 'ok', label='Original data', 
         markersize=8, alpha=0.4)
plt.plot(x, beta[0] + beta[1]*x, 'gray', label='Fitted line')
plt.title("Figure 5.1: The OLS Fitted Line and the Original Data",
          fontsize=14)
plt.xlabel(r"$x$", fontsize=12)
plt.ylabel(r"$y$", fontsize=12)
plt.legend(fontsize=12)
plt.show()

Variance of $\widehat{\beta}$¶

In [8]:
N = y.size
# 利用OLS表达式估计参数的值
beta = np.dot(np.dot(np.linalg.inv(np.dot(X.T,X)), X.T), y)
# 计算残差
e = y-np.dot(X,beta)
# 计算sigma平方的估计量
sigma2 = (1/(N-2)) * np.inner(e.T,e.T)
print("Estimate of sigma2:",sigma2[0])
# 计算OLS估计量方差矩阵
var = np.linalg.inv(np.dot(X.T,X))*sigma2
print("Variance-covariance matrix \n", var)
# 计算OLS估计量标准误
se = np.sqrt(np.diag(var))
print("SEs of beta0 is %6.3f and of beta1 is %6.3f"
      % (se[0],se[1]))
Estimate of sigma2: [0.31879951]
Variance-covariance matrix 
 [[ 0.02541155 -0.00269516]
 [-0.00269516  0.00032085]]
SEs of beta0 is  0.159 and of beta1 is  0.018

Goodness of Fit¶

In [9]:
n = y.size
k = X.shape[1]
M0 = np.eye(n) - np.ones((n,n))/n
# 计算RSS与TSS
RSS = e.T@e
TSS = y.T@M0@y
# 计算R2与adj. R2
R2 = 1 - RSS/TSS
adjR2 = 1- (RSS/(n-k))/(TSS/(n-1))
print("R-squared: %5.4f" % (R2))
print("Adj. R-squared: %5.4f" % (adjR2))
R-squared: 0.0406
Adj. R-squared: 0.0321

Inferences¶

Student's $t$-test¶

In [10]:
print("t-stat of beta1: %4.3f"
      % ((beta.reshape(-1)/se)[1]) )
t-stat of beta1: 2.186
In [11]:
from scipy.stats import t
cv = t.ppf(0.975, df=N-2) # 临界值
lower_bounds = beta.reshape(-1) - se*cv
upper_bounds = beta.reshape(-1) + se*cv
print("95 CIs of beta1: [%5.3f, %5.3f]" % (lower_bounds[1],upper_bounds[1]))
95 CIs of beta1: [0.004, 0.075]

$z$-test¶

In [12]:
from scipy.stats import norm
# 计算S_x与S_ex
Sx = (X.T@X)/n
Q = np.diag(e.reshape(-1))**2
Sex = (X.T@Q@X)/n
Avar = np.linalg.inv(Sx)@Sex@np.linalg.inv(Sx)/n
Ase = np.sqrt(np.diag(Avar))
zstat = beta.reshape(-1)/Ase
print("z-stat beta1 is %4.3f" % (zstat[1]) )
cv = norm.ppf(0.975) # 临界值
lower_bounds = beta.reshape(-1) - Ase*cv
upper_bounds = beta.reshape(-1) + Ase*cv
print("95 asy. CIs of beta1 is [%5.3f,%5.3f]"
      % (lower_bounds[1],upper_bounds[1]))
z-stat beta1 is 2.223
95 asy. CIs of beta1 is [0.005,0.074]

$F$-test¶

In [ ]:
R = [[0, 1]]
print(result_ols.f_test(R))
In [ ]:
R = [[1, -1]]
print(result_ols.f_test(R))

Testing for Heteroskedasticity¶

In [ ]:
from statsmodels.stats.diagnostic import het_breuschpagan, het_white
model = sm.OLS(y, X).fit()
bp_result = het_breuschpagan(model.resid, X)
white_result = het_white(model.resid, X)
print('BP test p-value: '+str(bp_result[3]))
print('White test p-value: '+str(white_result[3]))

Residual Analysis¶

Visualization of Residuals¶

In [ ]:
import seaborn as sns
sigma2_e = np.sqrt(np.sum(e**2)/(N-2))
r = e/sigma2_e
# 作图
reference = np.random.normal(size=200000)
plt.figure(figsize=(8,4))
sns.kdeplot(reference, color='black', lw=2,
            label=r"$N(0,1)$ PDF", linestyle="--")
sns.histplot(r, kde=True, linewidth=0, stat='density',
             color='k', alpha=0.6,
             label="Histogram of standardized residuals")
plt.title("Figure 5.2: Visualization the Standardized Residuals",
          fontsize=14)
plt.ylabel("density", fontsize=12)
plt.xlabel(r"$r_i$", fontsize=12)
plt.legend(loc="best", fontsize=12)
plt.show()

Testing for Normality¶

In [16]:
from scipy.stats import normaltest
k2, p = normaltest(r)
print("K2 statistic: %5.4f \np-value: %5.4f" % (k2,p))
K2 statistic: 94.6726 
p-value: 0.0000

Robust Standard Errors¶

White Standard Errors¶

In [17]:
omega_hat = (e@e.T)*n/(n-k)
# 计算稳健方差矩阵
XXinv = np.linalg.inv(X.T@X)
var = XXinv@X.T@np.diag(np.diag(omega_hat))@X@XXinv
print("White variance-covariance matrix \n", var)
# 计算稳健标准误
se = np.sqrt(np.diag(var))
print("White SEs of beta0 is %6.5f and of beta1 is %6.5f" % (se[0],se[1]))
White variance-covariance matrix 
 [[ 0.02255978 -0.00250354]
 [-0.00250354  0.00031564]]
White SEs of beta0 is 0.15020 and of beta1 is 0.01777

Clustered Standard Errors¶

Call Stata from Python¶

In [ ]:
# 配置Python-Stata交互
import os
os.chdir('/Applications/Stata/utilities')
from pystata import config
config.init('mp')
# 将数据传输至Stata
from pystata import stata
stata.pdataframe_to_data(data, force=True)
# 回归并使用聚类标准误
stata.run(
    '''gen lincome = ln(income)
    regress lincome educ, cluster(village_pid)
    ''')
Linear regression                      Number of obs     =        115
                                       F(1, 7)           =       9.80
                                       Prob > F          =     0.0166
                                       R-squared         =     0.0406
                                       Root MSE          =     .56462

                   (Std. err. adjusted for 8 clusters in village_pid)
---------------------------------------------------------------------
        |               Robust
lincome | Coefficient  std. err.    t    P>|t|   [95% conf. interval]
--------+------------------------------------------------------------
   educ |   .0391548   .0125084   3.13   0.017   .0095772    .0687324
  _cons |   9.995728   .1576285  63.41   0.000   9.622995    10.36846
---------------------------------------------------------------------

Code in Python¶

In [18]:
# 从数据中获取cluster变量
cluster = data.loc[:,"village_pid"].to_numpy(dtype='float',na_value=np.nan).reshape(-1,1)
# 利用np.unique获取G的数值
G = len(np.unique(cluster))
# 计算聚类方差的Omega
omega_hat = (e@e.T)*((G)/(G-1))*((n-1)/(n-k))
omega_hat_c = np.copy(omega_hat)
for i in range(n):
    for j in range(n):
        if i>j:
            if cluster[i]!=cluster[j]:
                omega_hat_c[i,j] = 0; omega_hat_c[j,i] = 0
In [19]:
# 计算聚类方差矩阵
XXinv = np.linalg.inv(X.T@X)
var = XXinv@X.T@omega_hat_c@X@XXinv
print("Clustered variance-covariance matrix \n", var)
# 计算聚类标准误
se = np.sqrt(np.diag(var))
print("Clustered SEs of beta0 is %6.5f and of beta1 is %6.5f" 
      % (se[0],se[1]))
Clustered variance-covariance matrix 
 [[ 0.02484674 -0.00062565]
 [-0.00062565  0.00015646]]
Clustered SEs of beta0 is 0.15763 and of beta1 is 0.01251

Endogeneity¶

Instrumental Variable Estimations¶

Just-identified Case¶

In [20]:
import wooldridge
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
reduc_women = wooldridge.data('mroz')
# 去掉数据中工资为空的观测值
datas = reduc_women.dropna(subset=['lwage'])
x=datas[['exper','expersq','educ']]
y=datas.loc[:,'lwage']
X = sm.add_constant(x)
# OLS估计法(不处理内生性问题)
result_ols = sm.OLS(y, X).fit()
print(result_ols.summary2())
                 Results: Ordinary least squares
=================================================================
Model:              OLS              Adj. R-squared:     0.151   
Dependent Variable: lwage            AIC:                871.1979
Date:               2024-03-25 12:02 BIC:                887.4344
No. Observations:   428              Log-Likelihood:     -431.60 
Df Model:           3                F-statistic:        26.29   
Df Residuals:       424              Prob (F-statistic): 1.30e-15
R-squared:          0.157            Scale:              0.44412 
-------------------------------------------------------------------
           Coef.    Std.Err.      t      P>|t|     [0.025    0.975]
-------------------------------------------------------------------
const     -0.5220     0.1986   -2.6282   0.0089   -0.9125   -0.1316
exper      0.0416     0.0132    3.1549   0.0017    0.0157    0.0675
expersq   -0.0008     0.0004   -2.0628   0.0397   -0.0016   -0.0000
educ       0.1075     0.0141    7.5983   0.0000    0.0797    0.1353
-----------------------------------------------------------------
Omnibus:             77.792       Durbin-Watson:          1.961  
Prob(Omnibus):       0.000        Jarque-Bera (JB):       300.917
Skew:                -0.753       Prob(JB):               0.000  
Kurtosis:            6.822        Condition No.:          2212   
=================================================================
* The condition number is large (2e+03). This might indicate
strong multicollinearity or other numerical problems.
In [21]:
y = datas[['lwage']].to_numpy()
X = sm.add_constant(datas[['exper','expersq','educ']]).to_numpy()
Z = sm.add_constant(datas[['exper','expersq',
                           'fatheduc']]).to_numpy()
# IV点估计
beta_IV = np.linalg.inv(Z.T@X)@Z.T@y
# 方差估计
e =  y-X@beta_IV
# 计算sigma平方的估计量
sigma2 = (1/(y.size)) * np.inner(e.T,e.T)
se_IV = np.sqrt(np.diag(np.linalg.inv(Z.T@X)@Z.T
                        @Z@np.linalg.inv(X.T@Z)*sigma2))

print("          Parameter  Std. Err.")
print("Intercept %8.4f, %8.4f" 
      % (beta_IV[0], se_IV[0]) )
print("exper     %8.4f, %8.4f" 
      % (beta_IV[1], se_IV[1]) )
print("expersq   %8.4f, %8.4f" 
      % (beta_IV[2], se_IV[2]) )
print("educ      %8.4f, %8.4f" 
      % (beta_IV[3], se_IV[3]) )
          Parameter  Std. Err.
Intercept  -0.0611,   0.4344
exper       0.0437,   0.0133
expersq    -0.0009,   0.0004
educ        0.0702,   0.0343
In [ ]:
from linearmodels.iv import IV2SLS 
reg_2sls=IV2SLS.from_formula(
         formula='lwage~1+exper+expersq+[educ~fatheduc]',
         data= datas)
print(reg_2sls.fit(cov_type='unadjusted'))
IV-2SLS Estimation Summary                  
===================================================================
Dep. Variable:                lwage   R-squared:             0.1430
Estimator:                  IV-2SLS   Adj. R-squared:        0.1370
No. Observations:               428   F-statistic:           25.176
Date:              Wed, Nov 15 2023   P-value (F-stat)       0.0000
Time:                      17:57:30   Distribution:         chi2(3)
Cov. Estimator:          unadjusted                                

                       Parameter Estimates                  
===================================================================
         Parameter  Std. Err.   T-stat  P-value  Lower CI  Upper CI
-------------------------------------------------------------------
Intercept  -0.0611     0.4344  -0.1407   0.8881   -0.9125    0.7903
exper       0.0437     0.0133   3.2744   0.0011    0.0175    0.0698
expersq    -0.0009     0.0004  -2.2107   0.0271   -0.0017   -0.0001
educ        0.0702     0.0343   2.0485   0.0405    0.0030    0.1374
===================================================================
Endogenous: educ
Instruments: fatheduc
Unadjusted Covariance (Homoskedastic)
Debiased: False

Two-Stage Least Squares Estimation¶

In [22]:
Z = sm.add_constant(datas[['exper','expersq','fatheduc',
                           'motheduc']]).to_numpy()
# 2SLS点估计
Pz = Z@np.linalg.inv(Z.T@Z)@Z.T
Xhat = Pz@X
beta_2SLS = np.linalg.inv(Xhat.T@X)@Xhat.T@y
In [23]:
# 方差估计
e =  y-X@beta_2SLS
diagomega =  np.diag(np.diag(e@e.T))
se_2SLS = np.sqrt(np.diag(np.linalg.inv(Xhat.T@Xhat)
                          @(Xhat.T@diagomega@Xhat)
                          @np.linalg.inv(Xhat.T@Xhat)  ))
# 汇报结果
print("          Parameter  Robust Std. Err.")
print("Intercept %8.4f, %8.4f" 
      % (beta_2SLS[0], se_2SLS[0]) )
print("exper     %8.4f, %8.4f" 
      % (beta_2SLS[1], se_2SLS[1]) )
print("expersq   %8.4f, %8.4f" 
      % (beta_2SLS[2], se_2SLS[2]) )
print("educ      %8.4f, %8.4f" 
      % (beta_2SLS[3], se_2SLS[3]) )
          Parameter  Robust Std. Err.
Intercept   0.0481,   0.4278
exper       0.0442,   0.0155
expersq    -0.0009,   0.0004
educ        0.0614,   0.0332
In [ ]:
from linearmodels.iv import IV2SLS 
reg_2sls=IV2SLS.from_formula(
         formula='lwage~1+exper+expersq+[educ~motheduc+fatheduc]',
         data= datas)
print(reg_2sls.fit())
IV-2SLS Estimation Summary                  
===================================================================
Dep. Variable:                lwage   R-squared:             0.1357
Estimator:                  IV-2SLS   Adj. R-squared:        0.1296
No. Observations:               428   F-statistic:           18.611
Date:              Wed, Nov 15 2023   P-value (F-stat)       0.0003
Time:                      17:58:25   Distribution:         chi2(3)
Cov. Estimator:              robust                                

                       Parameter Estimates                  
===================================================================
         Parameter  Std. Err.   T-stat  P-value  Lower CI  Upper CI
-------------------------------------------------------------------
Intercept   0.0481     0.4278   0.1124   0.9105   -0.7903    0.8865
exper       0.0442     0.0155   2.8546   0.0043    0.0138    0.0745
expersq    -0.0009     0.0004  -2.1001   0.0357   -0.0017   -0.0001
educ        0.0614     0.0332   1.8503   0.0643   -0.0036    0.1264
===================================================================
Endogenous: educ
Instruments: fatheduc, motheduc
Unadjusted Covariance (Heteroskedastic)
Debiased: False

Testing for IV Validity¶

Testing for IV Relevance¶

In [24]:
#第一阶段回归
reg_1stage=smf.ols(formula='educ~exper+expersq+fatheduc',
                   data=datas)
results_1stage=reg_1stage.fit()
print(results_1stage.summary2())
                 Results: Ordinary least squares
==================================================================
Model:              OLS              Adj. R-squared:     0.170    
Dependent Variable: educ             AIC:                1846.5075
Date:               2024-03-25 12:02 BIC:                1862.7440
No. Observations:   428              Log-Likelihood:     -919.25  
Df Model:           3                F-statistic:        30.09    
Df Residuals:       424              Prob (F-statistic): 1.18e-17 
R-squared:          0.176            Scale:              4.3366   
-------------------------------------------------------------------
                Coef.   Std.Err.     t     P>|t|    [0.025   0.975]
-------------------------------------------------------------------
Intercept       9.8870    0.3956  24.9920  0.0000   9.1094  10.6646
exper           0.0468    0.0411   1.1391  0.2553  -0.0340   0.1276
expersq        -0.0012    0.0012  -0.9364  0.3496  -0.0036   0.0013
fatheduc        0.2705    0.0289   9.3670  0.0000   0.2137   0.3273
------------------------------------------------------------------
Omnibus:               10.235       Durbin-Watson:          1.919 
Prob(Omnibus):         0.006        Jarque-Bera (JB):       17.236
Skew:                  -0.103       Prob(JB):               0.000 
Kurtosis:              3.961        Condition No.:          1411  
==================================================================
* The condition number is large (1e+03). This might indicate
strong multicollinearity or other numerical problems.

Testing for IV Exogeneity¶

In [ ]:
from scipy.stats import chi2
# 1. 2SLS回归,并计算残差项
reg_2sls=IV2SLS.from_formula(
         formula='lwage~1+exper+expersq+[educ~fatheduc+motheduc]',
         data=datas)
results_2sls=reg_2sls.fit(cov_type='unadjusted', debiased=True)
# 2. 残差对工具变量与外生变量回归
datas['resid_2sls']=results_2sls.resids
# 3.1 检验OR所需要的回归构建
reg_or=smf.ols(formula='resid_2sls~ 1+exper+expersq+motheduc+fatheduc', data=datas)
results_or=reg_or.fit()
# 3.2 统计检验得出结论
R2 = results_or.rsquared #计算R2
n  = results_or.nobs     
q  = 2-1
teststt=n*R2
pval=1-chi2.cdf(teststt,q)

print(f'Overidentification test: {teststt}')
print(f'p-value: {pval}')
print(f'sample size: {n}')

Testing for Endogeneity¶

In [ ]:
reg_1stage=smf.ols(formula='educ~exper+expersq+motheduc+fatheduc',
                   data=datas)
results_1stage=reg_1stage.fit()
datas['resid']=results_1stage.resid
reg_endo=smf.ols(formula='lwage~resid+educ+exper+expersq',data=datas)
results_endo=reg_endo.fit()

print(results_endo.summary2())