import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.stats.stattools import durbin_watson
%matplotlib inline
# 画出八个时间序列的图像
fig, axes = plt.subplots(nrows=4, ncols=2,figsize=(10,6))
for i, ax in enumerate(axes.flatten()):
data = df[df.columns[i]]
ax.plot(data, color='red', linewidth=1)
ax.set_title(df.columns[i])
plt.tight_layout();
# step2:Granger’s Causality Test , 检验不同序列之间存在互相影响
maxlag=12
test='ssr_chi2test'
variables=df.columns
def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False):
df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
for c in df.columns:
for r in df.index:
test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
p_values = [round(test_result[i+1][0][test][1],4) for i in range(maxlag)]
min_p_value = np.min(p_values)
df.loc[r, c] = min_p_value
df.columns = [var + '_x' for var in variables]
df.index = [var + '_y' for var in variables]
return df
grangers_causation_matrix(df, variables = df.columns)
# step3:ADF测试,检验单个变量是否平稳
def adfuller_test(series, signif=0.05, name='', verbose=False):
r = adfuller(series, autolag='AIC')
output = {'test_statistic':round(r[0], 4), 'pvalue':round(r[1], 4), 'n_lags':round(r[2], 4), 'n_obs':r[3]}
p_value = output['pvalue']
def adjust(val, length= 6): return str(val).ljust(length)
# Print Summary
print(f' Augmented Dickey-Fuller Test on "{name}"', "\n ", '-'*47)
print(f' Null Hypothesis: Data has unit root. Non-Stationary.')
print(f' Significance Level = {signif}')
print(f' Test Statistic = {output["test_statistic"]}')
print(f' No. Lags Chosen = {output["n_lags"]}')
for key,val in r[4].items():
print(f' Critical value {adjust(key)} = {round(val, 3)}')
if p_value <= signif:
print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.")
print(f" => Series is Stationary.")
else:
print(f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis.")
print(f" => Series is Non-Stationary.")
for name, column in df.iteritems():
adfuller_test(column, name=column.name)
print('\n')
Augmented Dickey-Fuller Test on "rgnp"
-----------------------------------------------
Null Hypothesis: Data has unit root. Non-Stationary.
Significance Level = 0.05
Test Statistic = 0.6419
No. Lags Chosen = 2
Critical value 1% = -3.486
Critical value 5% = -2.886
Critical value 10% = -2.58
=> P-Value = 0.9886. Weak evidence to reject the Null Hypothesis.
=> Series is Non-Stationary.
Augmented Dickey-Fuller Test on "pgnp" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = 1.2743 No. Lags Chosen = 1 Critical value 1% = -3.486 Critical value 5% = -2.886 Critical value 10% = -2.58 => P-Value = 0.9965. Weak evidence to reject the Null Hypothesis. => Series is Non-Stationary.
Augmented Dickey-Fuller Test on "ulc" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = 1.3967 No. Lags Chosen = 2 Critical value 1% = -3.486 Critical value 5% = -2.886 Critical value 10% = -2.58 => P-Value = 0.9971. Weak evidence to reject the Null Hypothesis. => Series is Non-Stationary.
Augmented Dickey-Fuller Test on "gdfco" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = 0.5762 No. Lags Chosen = 5 Critical value 1% = -3.488 Critical value 5% = -2.887 Critical value 10% = -2.58 => P-Value = 0.987. Weak evidence to reject the Null Hypothesis. => Series is Non-Stationary.
Augmented Dickey-Fuller Test on "gdf" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = 1.1129 No. Lags Chosen = 7 Critical value 1% = -3.489 Critical value 5% = -2.887 Critical value 10% = -2.58 => P-Value = 0.9953. Weak evidence to reject the Null Hypothesis. => Series is Non-Stationary.
Augmented Dickey-Fuller Test on "gdfim" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -0.1987 No. Lags Chosen = 1 Critical value 1% = -3.486 Critical value 5% = -2.886 Critical value 10% = -2.58 => P-Value = 0.9387. Weak evidence to reject the Null Hypothesis. => Series is Non-Stationary.
Augmented Dickey-Fuller Test on "gdfcf" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = 1.6693 No. Lags Chosen = 9 Critical value 1% = -3.49 Critical value 5% = -2.887 Critical value 10% = -2.581 => P-Value = 0.9981. Weak evidence to reject the Null Hypothesis. => Series is Non-Stationary.
Augmented Dickey-Fuller Test on "gdfce" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -0.8159 No. Lags Chosen = 13 Critical value 1% = -3.492 Critical value 5% = -2.888 Critical value 10% = -2.581 => P-Value = 0.8144. Weak evidence to reject the Null Hypothesis. => Series is Non-Stationary.
for name, column in df_differenced.iteritems():
adfuller_test(column, name=column.name)
print('\n')
Augmented Dickey-Fuller Test on "rgnp"
-----------------------------------------------
Null Hypothesis: Data has unit root. Non-Stationary.
Significance Level = 0.05
Test Statistic = -9.0123
No. Lags Chosen = 2
Critical value 1% = -3.489
Critical value 5% = -2.887
Critical value 10% = -2.58
=> P-Value = 0.0. Rejecting Null Hypothesis.
=> Series is Stationary.
Augmented Dickey-Fuller Test on "pgnp" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -10.9813 No. Lags Chosen = 0 Critical value 1% = -3.488 Critical value 5% = -2.887 Critical value 10% = -2.58 => P-Value = 0.0. Rejecting Null Hypothesis. => Series is Stationary.
Augmented Dickey-Fuller Test on "ulc" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -8.769 No. Lags Chosen = 2 Critical value 1% = -3.489 Critical value 5% = -2.887 Critical value 10% = -2.58 => P-Value = 0.0. Rejecting Null Hypothesis. => Series is Stationary.
Augmented Dickey-Fuller Test on "gdfco" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -7.9102 No. Lags Chosen = 3 Critical value 1% = -3.49 Critical value 5% = -2.887 Critical value 10% = -2.581 => P-Value = 0.0. Rejecting Null Hypothesis. => Series is Stationary.
Augmented Dickey-Fuller Test on "gdf" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -10.0351 No. Lags Chosen = 1 Critical value 1% = -3.489 Critical value 5% = -2.887 Critical value 10% = -2.58 => P-Value = 0.0. Rejecting Null Hypothesis. => Series is Stationary.
Augmented Dickey-Fuller Test on "gdfim" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -9.4059 No. Lags Chosen = 1 Critical value 1% = -3.489 Critical value 5% = -2.887 Critical value 10% = -2.58 => P-Value = 0.0. Rejecting Null Hypothesis. => Series is Stationary.
Augmented Dickey-Fuller Test on "gdfcf" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -6.922 No. Lags Chosen = 5 Critical value 1% = -3.491 Critical value 5% = -2.888 Critical value 10% = -2.581 => P-Value = 0.0. Rejecting Null Hypothesis. => Series is Stationary.
Augmented Dickey-Fuller Test on "gdfce" ----------------------------------------------- Null Hypothesis: Data has unit root. Non-Stationary. Significance Level = 0.05 Test Statistic = -5.1732 No. Lags Chosen = 8 Critical value 1% = -3.492 Critical value 5% = -2.889 Critical value 10% = -2.581 => P-Value = 0.0. Rejecting Null Hypothesis. => Series is Stationary.
# step7:选择模型阶数并训练,根据AIC值,lag=4时达到局部最优
model = VAR(df_differenced)
for i in [1,2,3,4,5,6,7,8,9]:
result = model.fit(i)
print('Lag Order =', i)
print('AIC : ', result.aic, '\n')
Lag Order = 1
AIC : -1.3679402315450668
Lag Order = 2
AIC : -1.621237394447824
Lag Order = 3
AIC : -1.76580083870128
Lag Order = 4
AIC : -2.000735164470319
Lag Order = 5
AIC : -1.9619535608363936
Lag Order = 6
AIC : -2.3303386524829053
Lag Order = 7
AIC : -2.592331352347122
Lag Order = 8
AIC : -3.3172619764582087
Lag Order = 9
AIC : -4.804763125958635
# step8:durbin watson test,检验残差项中是否还存在相关性,这一步的目的是确保模型已经解释了数据中所有的方差和模式
out = durbin_watson(model_fitted.resid)
for col, val in zip(df.columns, out):
print(adjust(col), ':', round(val, 2)) # 检验值越接近2,说明模型越好