OLS Example with joint significance tests

In [1]: import numpy as np
   ....: from scipy import stats

In [2]: import statsmodels.api as sm

In [3]: import matplotlib
In [4]: import matplotlib.pyplot as plt

In [5]: from statsmodels.sandbox.regression.predstd import wls_prediction_std

Fix a random seed so these examples are reproducible.

In [6]: np.random.seed(9876789)

OLS: non-linear curve, but linear in parameters

In [7]: nsample = 50

In [8]: sig = 0.5

In [9]: x1 = np.linspace(0, 20, nsample)

In [10]: X = np.c_[x1, np.sin(x1), (x1-5)**2, np.ones(nsample)]

In [11]: beta = [0.5, 0.5, -0.02, 5.]

In [12]: y_true = np.dot(X, beta)

In [13]: y = y_true + sig * np.random.normal(size=nsample)
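
In other words, the data-generating process is

    y = 0.5*x1 + 0.5*sin(x1) - 0.02*(x1 - 5)**2 + 5 + e,    e ~ N(0, 0.5**2)

non-linear in x1 but linear in the parameters beta, which is all OLS requires.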

In [14]: plt.figure()
Out[14]: <matplotlib.figure.Figure at 0xd9463ac>

In [15]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[15]: 
[<matplotlib.lines.Line2D at 0xd837cec>,
 <matplotlib.lines.Line2D at 0xd837f8c>]

In [16]: res = sm.OLS(y, X).fit()

In [17]: print res.params
[ 0.47040466  0.2931004  -0.01826292  5.24935422]

In [18]: print res.bse
[ 0.02853117  0.11215937  0.00250506  0.18499717]

In [19]: print res.predict()
[  4.79278116   5.17262168   5.52726298   5.84073136   6.10281792   6.31075592   6.46967527   6.59175976   6.69424528   6.796588     6.91726779   7.07075203   7.26511865   7.50072896   7.77016828
   8.05946415   8.35038197   8.62342091   8.8610178    9.05043274   9.18584224   9.26929595   9.31037998   9.32464188   9.33103621   9.34881043   9.39434252   9.47845015   9.60461339   9.76840293
   9.95820778  10.15714295  10.34582361  10.50554994  10.62137947  10.68458209  10.69407437  10.65659757  10.58561009  10.49907624  10.41651482  10.3557922   10.33018692  10.34620807  10.4025259
  10.49019023  10.594101    10.69548911  10.77500019  10.81587443]

In [20]: prstd, iv_l, iv_u = wls_prediction_std(res)
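
For an OLS fit, the prediction standard error combines the residual variance with the variance of the fitted mean. A minimal sketch of what wls_prediction_std returns here (assuming the default alpha=0.05 and unit weights; the *_check names are introduced for this illustration):

    covb = res.cov_params()
    pred_var = res.mse_resid + (X * np.dot(X, covb)).sum(1)  # sigma^2 + x' Cov(b) x per row
    pred_std_check = np.sqrt(pred_var)
    tppf = stats.t.isf(0.05 / 2., res.df_resid)              # two-sided 95% critical value
    iv_l_check = res.fittedvalues - tppf * pred_std_check
    iv_u_check = res.fittedvalues + tppf * pred_std_check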

In [21]: plt.plot(x1, res.fittedvalues, 'r--.')
Out[21]: [<matplotlib.lines.Line2D at 0xdb406cc>]

In [22]: plt.plot(x1, iv_u, 'r--')
Out[22]: [<matplotlib.lines.Line2D at 0x43908fec>]

In [23]: plt.plot(x1, iv_l, 'r--')
Out[23]: [<matplotlib.lines.Line2D at 0xd863bec>]

In [24]: plt.title('blue: true,   red: OLS')
Out[24]: <matplotlib.text.Text at 0xd36b38c>

In [25]: print res.summary()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     175.9
Date:                Fri, 21 Mar 2014   Prob (F-statistic):           3.30e-25
Time:                        06:55:18   Log-Likelihood:                -38.308
No. Observations:                  50   AIC:                             84.62
Df Residuals:                      46   BIC:                             92.26
Df Model:                           3                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4704      0.029     16.487      0.000         0.413     0.528
x2             0.2931      0.112      2.613      0.012         0.067     0.519
x3            -0.0183      0.003     -7.290      0.000        -0.023    -0.013
const          5.2494      0.185     28.375      0.000         4.877     5.622
==============================================================================
Omnibus:                        1.779   Durbin-Watson:                   2.359
Prob(Omnibus):                  0.411   Jarque-Bera (JB):                1.440
Skew:                           0.414   Prob(JB):                        0.487
Kurtosis:                       2.933   Cond. No.                         221.
==============================================================================
[figure tut_ols_0.png: data points, true curve (blue) and OLS fitted values with 95% prediction interval (red)]
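
The F-statistic of 175.9 in the summary is the joint test that all three slope coefficients are zero. It can be reproduced with f_test; the restriction matrix A below is introduced for this check (one identity row per slope, dropping the constant, which is the last column of X):

    A = np.identity(len(res.params))[:-1]
    print(res.f_test(A))  # should match the summary's F-statistic and Prob (F-statistic)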

OLS with dummy variables

In [26]: sig = 1.

Suppose the observations come from 3 groups.

In [27]: xg = np.zeros(nsample, int)

In [28]: xg[20:40] = 1

In [29]: xg[40:] = 2

In [30]: print xg
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]

In [31]: dummy = (xg[:,None] == np.unique(xg)).astype(float)
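
Comparing xg against np.unique(xg) broadcasts to one indicator column per group. For reference, an equivalent construction sketched with pandas (pandas is an assumption here and is not used elsewhere in this example; recent pandas versions return bool columns, hence the cast):

    import pandas as pd
    dummy_check = pd.get_dummies(xg).values.astype(float)  # one column each for groups 0, 1, 2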

Use group 0 as the benchmark, i.e. drop its dummy column.

In [32]: X = np.c_[x1, dummy[:,1:], np.ones(nsample)]

In [33]: beta = [1., 3, -3, 10]

In [34]: y_true = np.dot(X, beta)

In [35]: y = y_true + sig * np.random.normal(size=nsample)

In [36]: res2 = sm.OLS(y, X).fit()

In [37]: print res2.params
[ 0.98475721  3.44866047 -2.68110961  9.77981018]

In [38]: print res2.bse
[ 0.06801982  0.64590456  1.05239527  0.35213863]

In [39]: print res2.predict()

In [40]: prstd, iv_l, iv_u = wls_prediction_std(res2)

In [41]: plt.figure()
Out[41]: <matplotlib.figure.Figure at 0xd84336c>

In [42]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[42]: 
[<matplotlib.lines.Line2D at 0xd8b95ec>,
 <matplotlib.lines.Line2D at 0xd8b998c>]

In [43]: plt.figure()
Out[43]: <matplotlib.figure.Figure at 0xd1f2a0c>

In [44]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[44]: 
[<matplotlib.lines.Line2D at 0xd9dc50c>,
 <matplotlib.lines.Line2D at 0xd9dc8cc>]

In [45]: plt.plot(x1, res2.fittedvalues, 'r--.')
Out[45]: [<matplotlib.lines.Line2D at 0xd9dcd8c>]

In [46]: plt.plot(x1, iv_u, 'r--')
Out[46]: [<matplotlib.lines.Line2D at 0xd9dcd0c>]

In [47]: plt.plot(x1, iv_l, 'r--')
Out[47]: [<matplotlib.lines.Line2D at 0xd1f2aec>]

In [48]: plt.title('blue: true,   red: OLS')
Out[48]: <matplotlib.text.Text at 0xd88dbec>

In [49]: print res2.summary()

[figure tut_ols_1.png: data points, true group means (blue) and OLS fitted values with 95% prediction interval (red)]

In [50]: R = [[0, 1, 0, 0],
   ....:       [0, 0, 1, 0]]
   ....: 

F test of the joint hypothesis R * beta = 0, i.e. that the coefficients on both dummy variables are zero.

In [51]: print res2.f_test(R)
<F test: F=array([[ 124.30756422]]), p=[[ 0.]], df_denom=46, df_num=2>

The null hypothesis of an identical constant across the 3 groups is strongly rejected: the p-value is essentially zero.
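
Behind f_test is a Wald statistic, F = (R b)' [R Cov(b) R']^{-1} (R b) / q, with q = 2 restrictions here. A minimal sketch of the same computation (Rm, Fval and pval are names introduced for this check):

    Rm = np.asarray(R, dtype=float)
    q = Rm.shape[0]
    Rb = np.dot(Rm, res2.params)                      # estimated restriction values R b
    middle = np.linalg.inv(Rm.dot(res2.cov_params()).dot(Rm.T))
    Fval = Rb.dot(middle).dot(Rb) / q
    pval = stats.f.sf(Fval, q, res2.df_resid)         # upper tail of the F distribution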

In [52]: help(res2.f_test)
Help on method f_test in module statsmodels.base.model:

f_test(self, r_matrix, q_matrix=None, cov_p=None, scale=1.0, invcov=None) method of statsmodels.regression.linear_model.OLSResults instance
    Compute an Fcontrast/F-test for a contrast matrix.
    
    Here, matrix `r_matrix` is assumed to be non-singular. More precisely,
    
    r_matrix (pX pX.T) r_matrix.T
    
    is assumed invertible. Here, pX is the generalized inverse of the
    design matrix of the model. There can be problems in non-OLS models
    where the rank of the covariance of the noise is not full.
    
    Parameters
    ----------
    r_matrix : array-like
        q x p array where q is the number of restrictions to test and
        p is the number of regressors in the full model fit.
        If q is 1 then f_test(r_matrix).fvalue is equivalent to
        the square of t_test(r_matrix).t
    q_matrix : array-like
        q x 1 array, that represents the sum of each linear restriction.
        Default is all zeros for each restriction.
    scale : float, optional
        Default is 1.0 for no scaling.
    invcov : array-like, optional
        A qxq matrix to specify an inverse covariance
        matrix based on a restrictions matrix.
    
    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> results = sm.OLS(data.endog, data.exog).fit()
    >>> A = np.identity(len(results.params))
    >>> A = A[:-1,:]
    
    This tests that each coefficient is jointly statistically
    significantly different from zero.
    
    >>> print results.f_test(A)
    <F contrast: F=330.28533923463488, p=4.98403052872e-10,
     df_denom=9, df_num=6>
    
    Compare this to
    
    >>> results.F
    330.2853392346658
    >>> results.F_p
    4.98403096572e-10
    
    >>> B = np.array(([0,1,-1,0,0,0,0],[0,0,0,0,1,-1,0]))
    
    This tests that the coefficient on the 2nd and 3rd regressors are
    equal and jointly that the coefficient on the 5th and 6th regressors
    are equal.
    
    >>> print results.f_test(B)
    <F contrast: F=9.740461873303655, p=0.00560528853174, df_denom=9,
     df_num=2>
    
    See also
    --------
    statsmodels.contrasts
    statsmodels.model.t_test

t test for the null hypothesis that the effects of the 2nd and 3rd groups are equal (R picks out their difference).

In [53]: R = [0, 1, -1, 0]

In [54]: print res2.t_test(R)
<T test: effect=array([ 6.12977008]), sd=array([[ 0.58029388]]), t=array([[ 10.5632168]]), p=array([[ 0.]]), df_denom=46>

With the large group effects used here, the null hypothesis of equal effects is strongly rejected: the estimated difference of about 6.13 is more than ten standard errors from zero.
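
t_test applies the single-restriction analogue, t = (R b) / sqrt(R Cov(b) R'). A minimal sketch (names introduced for this check):

    Rv = np.array([0., 1., -1., 0.])
    effect = np.dot(Rv, res2.params)                 # estimated difference of group effects
    sd = np.sqrt(Rv.dot(res2.cov_params()).dot(Rv))  # its standard error
    tval = effect / sd
    pval = 2 * stats.t.sf(abs(tval), res2.df_resid)  # two-sided p-value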

OLS with small group effects

In [55]: beta = [1., 0.3, -0.0, 10]

In [56]: y_true = np.dot(X, beta)

In [57]: y = y_true + sig * np.random.normal(size=nsample)

In [58]: res3 = sm.OLS(y, X).fit()

In [59]: print res3.f_test(R)
<F test: F=array([[ 0.4929838]]), p=[[ 0.48613705]], df_denom=46, df_num=1>

With only small group effects, this test does not reject its null hypothesis (p ≈ 0.49).
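
Note that R was overwritten above by the single-row restriction from the t test, which is why df_num=1 here. To test the original joint hypothesis that both dummy coefficients are zero, rebuild the two-row matrix; R2 is introduced for this sketch and its output is not reproduced:

    R2 = [[0, 1, 0, 0],
          [0, 0, 1, 0]]
    print(res3.f_test(R2))  # 2 restrictions, so df_num=2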
