The CSV data is based on the following MySQL query on the gdelt_events_country_ETH table.
import numpy as np # load prev. data for prediction
import os # del files, sys exit
import pandas as pd
# get fresh data from database:
import mysql.connector
from mysql.connector import Error
def execute_query(query, cnx):
    cursor = cnx.cursor(raw=True, buffered=True)
    result = None  # stays None if the query fails
    try:
        cursor.execute(query)
        result = cursor.fetchall()
        cursor.close()
    except Error as e:
        print("sql query err")
        errorstring = format(str(e.args[0]))
        errornumber = errorstring[:4]
        print(errornumber)
        print(errorstring)
    return result
# remote DB config (disabled)
if 1 == 2:
    mysql_config = {
        'user': 'DB-user',
        'password': 'DB-password',
        'host': 'DB-server',
        'database': 'datacoll',
        'raise_on_warnings': True,
        'port': '45506',
    }
# local DB config (enabled)
if 1 == 1:
    mysql_config = {
        'user': 'DB-user',
        'password': 'DB-password',
        'host': '127.0.0.1',
        'database': 'datacoll',
        'raise_on_warnings': True,
        'port': '3306',
        'ssl_disabled': True
    }
try:
    cnx = mysql.connector.connect(**mysql_config)
    if cnx.is_connected():
        print('connected to datacoll 2')
except Error as e:
    print("mysql DB connection error")
    print(e)
    print("exception main: !! connection error in delete old")
    print(format(str(e.args[0])))
    os.sys.exit(1)
# print(db)
# the model will be for this country:
# CHE AUT DEU ER ETH SDN SYR USA RW
country = "ETH"
query_general = f"""select Day
    , sum(NumArticles) as 'total'
    , sum( if(EventRootCode = '01', NumArticles, 0) ) as '01'
    , sum( if(EventRootCode = '02', NumArticles, 0) ) as '02'
    , sum( if(EventRootCode = '03', NumArticles, 0) ) as '03'
    , sum( if(EventRootCode = '04', NumArticles, 0) ) as '04'
    , sum( if(EventRootCode = '05', NumArticles, 0) ) as '05'
    , sum( if(EventRootCode = '06', NumArticles, 0) ) as '06'
    , sum( if(EventRootCode = '07', NumArticles, 0) ) as '07'
    , sum( if(EventRootCode = '08', NumArticles, 0) ) as '08'
    , sum( if(EventRootCode = '09', NumArticles, 0) ) as '09'
    , sum( if(EventRootCode = '10', NumArticles, 0) ) as '10'
    , sum( if(EventRootCode = '11', NumArticles, 0) ) as '11'
    , sum( if(EventRootCode = '12', NumArticles, 0) ) as '12'
    , sum( if(EventRootCode = '13', NumArticles, 0) ) as '13'
    , sum( if(EventRootCode = '14', NumArticles, 0) ) as '14'
    , sum( if(EventRootCode = '15', NumArticles, 0) ) as '15'
    , sum( if(EventRootCode = '16', NumArticles, 0) ) as '16'
    , sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'
    , sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'
    , sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'
    , sum( if(EventRootCode = '20', NumArticles, 0) ) as '20'
from gdelt_events_country_{country} where left(Day,4) > 2016 group by Day;"""
query_19 = f"""select Day
    , sum(NumArticles) as 'total'
    , sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'
    , sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'
    , sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'
from gdelt_events_country_{country} where left(Day,4) > 2016 group by Day;"""
columns_19 = ["Day", "total", "17", "18", "19"]
columns_general = ["Day", "total", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"]
#"select * from gdelt_events limit 2;"
result = execute_query(query_general, cnx)
column_list = []
df_1 = pd.DataFrame(columns=columns_general)
for col in range(len(columns_general)):
    for item in result:
        # the raw cursor returns bytes; decode every value to a string
        column_list.append(bytes(item[col]).decode())
    df_1[columns_general[col]] = column_list
    column_list = []
print(df_1.head())
df_1.to_csv(r'ar_data_'+country+'.csv')
### Load data and convert date
The columns 01-20 contain the GDELT root event codes, e.g. "fight" for 19 or "coerce" for 17.
See the column definition of EventRootCode on https://mind-node.net/gdelt-event-2-0-cheat-sheet/ or the CAMEO event table on https://mind-node.net/cameo-event-codes/. An example usage is on https://mind-node.net/country-dashboard/. The original description is from the GDELT project: http://data.gdeltproject.org/documentation/CAMEO.Manual.1.1b3.pdf
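For readability, here is a small lookup of those CAMEO root codes (labels as in the CAMEO manual linked above; the dictionary itself is my addition and is not used by the code below):
# CAMEO event root codes and their meanings
cameo_root_codes = {
    '01': 'Make public statement', '02': 'Appeal', '03': 'Express intent to cooperate',
    '04': 'Consult', '05': 'Engage in diplomatic cooperation', '06': 'Engage in material cooperation',
    '07': 'Provide aid', '08': 'Yield', '09': 'Investigate', '10': 'Demand',
    '11': 'Disapprove', '12': 'Reject', '13': 'Threaten', '14': 'Protest',
    '15': 'Exhibit force posture', '16': 'Reduce relations', '17': 'Coerce',
    '18': 'Assault', '19': 'Fight', '20': 'Use unconventional mass violence',
}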
I save the SQL result in a CSV file and load it for repeated runs, as this is faster.
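A minimal sketch of that caching pattern (assuming the names from the query cell above; the if/else wiring is my illustration):
import os.path
csvfilename = 'ar_data_' + country + '.csv'
if os.path.exists(csvfilename):
    df_1 = pd.read_csv(csvfilename)  # fast path: reuse the cached query result
else:
    result = execute_query(query_general, cnx)  # slow path: query the database
    # ... build df_1 from result as in the cell above, then cache it:
    df_1.to_csv(csvfilename)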
import pandas as pd
import numpy as np
csvfilename = 'ar_data_ETH.csv'
# filepath relative to the jupyter notebook
csvfilepath = ''
df_1 = pd.read_csv(csvfilepath + csvfilename,
                   delimiter=',',
                   header=0,
                   index_col=False)
# change format of datetime
df_1['Day'] = pd.to_datetime(df_1['Day'], format='%Y%m%d')
df_1 = df_1.drop(['Unnamed: 0'], axis=1)
df_1['day_of_week'] = df_1['Day'].dt.dayofweek
df_1['week'] = df_1['Day'].dt.week  # on newer pandas use df_1['Day'].dt.isocalendar().week
print(df_1.dtypes)
print(df_1.head(9))
listofcolumns = df_1.columns
print(listofcolumns)
print("shape: ", df_1.shape)
#df_1.plot(subplots=True, figsize=(20,25), grid=True)
df_1['19'].plot(subplots=True, figsize=(20,15), grid=True, style= 'k.')
df_1['19'].describe()
def add_stats(df_stats, columnName):
    # add rolling mean, min, max and a shifted copy to the given column of a dataframe
    wwidth = 5  # window width for the rolling statistics
    window2 = df_stats[columnName].rolling(window=wwidth)
    # add statistical data
    df_stats[columnName+'-avg'] = window2.mean()
    df_stats[columnName+'-min'] = window2.min()
    df_stats[columnName+'-max'] = window2.max()
    # shifted copy, e.g. '19-t-5' holds the value from wwidth-1 days earlier
    df_stats[columnName+'-t-'+str(wwidth)] = df_stats[columnName].shift(wwidth-1)
    return df_stats
def loop_columns(df_loop, exclude=[]):
    # loop over all columns and exclude the given list of columns
    # df_loop: df to loop over columns
    # exclude: list of column names to exclude from the loop
    # returns a df (which might be altered)
    for (columnName, columnData) in df_loop.iteritems():  # on newer pandas: .items()
        if columnName not in exclude:
            df_loop = add_stats(df_loop, columnName)
    return df_loop
df_2 = df_1.copy()
df_2 = loop_columns(df_2, ["Day", "day_of_week", "week"])
df_2['month'] = df_2['Day'].dt.month
df_2['day'] = df_2['Day'].dt.day
df_2['year'] = df_2['Day'].dt.year
df_2['day_diff'] = (df_2['Day'] - df_2['Day'].iloc[0]).dt.days
df_2['const'] = 1
# look at specific columns: is everything right?
print(df_2.head().dropna())
#print(df_2.dtypes)
# delete old df to free memory:
#del df_1
# show python vars in buffer:
#%whos
df_2['19'].describe()
I am interested in predicting column 19, "fight". To see which data can help to predict it, I test the Granger causality of the other columns.
# example from statsmodel
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
import numpy as np
data = sm.datasets.macrodata.load_pandas()
data = data.data[['realgdp', 'realcons']].pct_change().dropna()
print(data)
data.info()
gc_res = grangercausalitytests(data, 4, verbose=False)
for key in gc_res.keys():
    print(key)
    _F_test_ = gc_res[key][0]['params_ftest'][0]
    print(_F_test_)
#print(gc_res)
from statsmodels.tsa.stattools import grangercausalitytests
df_2 = df_2.dropna()
def F_Test(df_loop, columnName, columnTest='19'):
    # Granger-test whether columnName helps to predict columnTest, for all lags up to 'lag'
    lag = 109
    F_testt = []
    try:
        # note: uses the global df_2, which still contains columnTest ('19')
        gc_res = grangercausalitytests(df_2[[columnTest, columnName]], lag, verbose=False)
        for key in gc_res.keys():
            _F_test_ = gc_res[key][0]['params_ftest'][0]
            F_testt.append({'columnName': columnName, 'lag': key, 'F_Test': _F_test_})
        return F_testt
    except Exception:
        # some columns fail the test (e.g. constant or collinear series); skip them
        return None
def loop_columns(df_loop, exclude, function):
    # loop over all columns and exclude the given list of columns
    # df_loop: df to loop over columns
    # exclude: list of column names to exclude from the loop
    # function: function to call for each column
    # returns: depends on the function called
    F_tests = []
    for (columnName, columnData) in df_loop.iteritems():
        if columnName not in exclude:
            if function == 'F_Test':
                F_tests.append(F_Test(df_loop, columnName))
    # remove None entries from failed tests, otherwise the flattening below breaks
    F_tests = [i for i in F_tests if i]
    return F_tests
#df_2.dropna(0)
best_F = loop_columns(df_2[["total","01","02","03","04","05","06","07","08","09","10","11","12","13","14","15", "16"]], ["Day", "19"], 'F_Test')
# flatten list of lists
best_F = [item for sublist in best_F for item in sublist]
# sort by F-test
best_F = sorted(best_F, key=lambda k: k['F_Test'])
print("col-lag- F-test granger (order by F-test)")
column_lag_ftest = []
for item in best_F:
column_lag_ftest.append(str(item['columnName'])+" - "+str(item['lag'])+" - "+str(item['F_Test']))
# print(str(item['columnName'])+" - "+str(item['lag'])+" - "+str(item['F_Test']))
# print the columns lags with the highest f-test value
for item in column_lag_ftest[-20:]:
print(item)
#print(best_F)
from matplotlib import pyplot
import seaborn as sns
# make extra window for plot
#%matplotlib qt
%matplotlib inline
plot_column_y = "19"
plot_column_x = "Day"
ax=df_2.plot(x=plot_column_x, y=plot_column_y, color="red", figsize=(20,15), grid=False)
df_2.plot(x=plot_column_x, y=[plot_column_y+'-min', plot_column_y+'-max'], ax=ax, color="grey")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-t-5' ], ax=ax, color="green")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-avg' ], ax=ax, color="blue")
#df_2.plot(title=plot_column_y+" avg, min, max")
#df_2.plot.close()
pyplot.show()
# compare two datasets from the imported data:
%matplotlib inline
fig, ax1 = pyplot.subplots(figsize=(20,10))
# use twinx for 2 scales (disabled here; both series share one y-axis)
ax2 = ax1  # .twinx()
ax1.plot(df_2[plot_column_x], df_2[plot_column_y], 'g-')
ax2.plot(df_2[plot_column_x], df_2['01'], 'b-')
ax1.set_xlabel(plot_column_x)
ax1.set_ylabel(plot_column_y, color='g')
ax2.set_ylabel('01', color='b')
ax1.set(title=plot_column_y+" and 01" )
pyplot.show()
Here I try to find frequencies in the signal (not quite working yet).
import numpy as np
import matplotlib.pyplot as plt
#data = np.loadtxt("data.txt")
data = np.array([[0, 1.01],
[0.5, 1.04],
[1, 1.05],
[1.5, 1.24]])
t = data[:,0]
signal_t = data[:,1]
dt = t[1]-t[0]
f = np.fft.fftfreq(t.size, dt)
signal_f = np.fft.fft(signal_t)*dt
plt.figure()
plt.stem(np.fft.fftshift(f), np.fft.fftshift(np.abs(signal_f)))
plt.show()
print(df_2.index)
signal_t = df_2['19'].values
t = np.arange(signal_t.size)  # sample index; avoids a hard-coded length
dt = t[1] - t[0]
f = np.fft.fftfreq(t.size, dt)
signal_f = np.fft.fft(signal_t)*dt
plt.figure(figsize=(25,4))
x = np.fft.fftshift(f)
y = np.fft.fftshift(np.abs(signal_f))
plt.xlim(-.001, .1)
plt.stem(x, y)
#axes.set_xlim([0,6])
plt.show()
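One likely reason the spectrum looks uninformative is the large zero-frequency spike coming from the signal's mean (the DC component). Subtracting the mean first makes other peaks visible; a minimal sketch reusing the variables above:
signal_centered = signal_t - signal_t.mean()  # remove the mean (DC component)
signal_f = np.fft.fft(signal_centered)*dt
plt.figure(figsize=(25,4))
plt.xlim(-.001, .1)
plt.stem(np.fft.fftshift(f), np.fft.fftshift(np.abs(signal_f)))
plt.show()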
import numpy as np
import matplotlib.pyplot as plt
#import scipy.io.wavfile as wave  # only needed for the commented-out wav example
# rate,data = wave.read('57.wav')
data = df_2['19']
spectre = np.fft.fft(data)
freq = np.fft.fftfreq(data.size, 100)
mask = freq > 0
plt.plot(freq[mask], np.abs(spectre[mask]))
t = np.linspace(0, 2*np.pi, 1000, endpoint=True)
f = 3.0 # Frequency in Hz
A = 100.0 # Amplitude in Unit
s = A * np.sin(2*np.pi*f*t) # Signal
plt.plot(t,s)
plt.xlabel('Time ($s$)')
plt.ylabel('Amplitude ($Unit$)')
plt.show()
Y = np.fft.fft(s)
plt.plot(np.abs(Y))  # plot the magnitude; plotting complex values directly only shows the real part
# create a difference transform of the dataset
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        value = dataset[i] - dataset[i - 1]
        diff.append(value)
    return np.array(diff)
# calc diff and ln diff
df_2[plot_column_y+'-ln'] = np.log(df_2[plot_column_y])
df_2[plot_column_y+'-ln'+'-diff'] = df_2[plot_column_y+'-ln'].dropna().diff()
df_2[plot_column_y+'-diff'] = df_2[plot_column_y].diff()
df_2.head()
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
# Difference
axes[0].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-diff'], '-')
axes[0].set(title='difference of '+plot_column_y)
# Log difference
axes[1].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-ln'+"-diff"], '-')
axes[1].hlines(0, df_2.index[0], df_2.index[-1], 'r')
axes[1].set(title='19 - difference of logs');
# Graph data
#fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
#axes[0].plot(df_2['19-avg'], kind= 'kde')
df_2['19'].plot(kind= 'kde')#, figsize=(20,9)
pyplot.show()
df_2['19'].hist()
pyplot.show()
print('The histogram shows that the values do not have a Gaussian distribution.\n\n')
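To back the visual impression with a number, a normality test can be run on the series (a sketch using scipy's D'Agostino-Pearson test; a small p-value rejects the Gaussian hypothesis):
from scipy import stats
stat, p = stats.normaltest(df_2['19'].dropna())
print('normaltest statistic=%.3f, p=%.6f' % (stat, p))
if p < 0.05:
    print('normality rejected: the data does not look Gaussian')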
sns.boxplot( y=df_2['19']);
pyplot.show()
print('This plot draws a box around the 25th and 75th percentiles of the data that captures the middle 50% of observations.\n\
A line is drawn at the 50th percentile (the median) and whiskers are drawn above and below the box \n\
to summarize the general extents of the observations.\n\
Dots are drawn for outliers outside the whiskers or extents of the data.\n\n')
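The quantities the box plot draws can also be computed directly (a minimal sketch; 1.5*IQR is the conventional whisker length):
q1, q3 = df_2['19'].quantile([0.25, 0.75])
iqr = q3 - q1  # interquartile range: the height of the box
print('25th percentile: %.1f, median: %.1f, 75th percentile: %.1f' % (q1, df_2['19'].median(), q3))
print('whiskers (1.5*IQR rule): [%.1f, %.1f]' % (q1 - 1.5*iqr, q3 + 1.5*iqr))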
df_1['19'].describe()
df_2['19-sqrt'] = np.sqrt(df_2['19'].values)
df_2['19-sqrt'].plot()
#df_2['19'].plot()
pyplot.show()
df_2['19-sqrt'].hist()
pyplot.show()
df_2['19-log'] = np.log(df_2['19'].values)
df_2['19-log'].plot()
pyplot.show()
print(df_2.sort_values('19-log'))
df_2['19-log'].replace(-np.inf, 0, inplace=True)  # log(0) yields -inf; replace those values with 0
#pyplot.show()
df_2['19-log'].describe()
from scipy.stats import boxcox
# note: boxcox requires strictly positive input values
df_2['19-avg-boxcox'], lam = boxcox(df_2['19-avg'].values)
print( ' Lambda: %f ' % lam)
df_2['19-avg-boxcox'].plot()
pyplot.show()
df_2['19-avg-boxcox'].hist()
pyplot.show()
import statsmodels.api as sm
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
#df_2.set_index('Day')  # returns a new dataframe; the result was never assigned, so this had no effect
print(df_2.iloc[1:][plot_column_y+'-ln'+'-diff'])
#fig = sm.graphics.tsa.plot_acf(df_2[plot_column_y], lags=20, ax=axes[0])
#fig = sm.graphics.tsa.plot_pacf(df_2[plot_column_y+"-diff"], lags=150, ax=axes[1])
#print(df_2['19-avg-log'].dropna())
fig = sm.graphics.tsa.plot_acf(df_2['19'].dropna(), lags=150, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(df_2['19'].dropna(), lags=150, ax=axes[1])
A correlation calculated between two groups of numbers, such as observations and their lag-1 values, results in a number between -1 and 1. The sign indicates a negative or positive correlation, respectively. A value close to zero suggests a weak correlation, whereas a value closer to -1 or 1 indicates a strong correlation.
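For example, the lag-1 autocorrelation of the raw series can be computed directly with pandas:
lag1_corr = df_2['19'].autocorr(lag=1)  # correlation of the series with itself shifted by one day
print('lag-1 autocorrelation of column 19: %.3f' % lag1_corr)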
from pandas.plotting import autocorrelation_plot
#print(len(df_2['19'].dropna())) # 1073
#print(len(df_2['19'].dropna())) # 1069
#df_3 = df_2[['Day', '19', '19-avg']].dropna()
#print(df_3.iloc[110:120,0:8])
#fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
#axes[0].plot(autocorrelation_plot(df_2['18']))
#axes[0]
ax = autocorrelation_plot(df_2['19-avg'].dropna())
ax.set_xlim([0,120])
ax.set_ylim([-0.25,0.75])
df_2['19-avg'].dropna().autocorr()
#autocorrelation_plot(df_2['19-avg'].dropna())
pyplot.show()
Dotted lines indicate the significance bounds: correlation values above those lines are statistically significant (meaningful).
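pandas draws these bands at roughly +-1.96/sqrt(n) (95%) and +-2.58/sqrt(n) (99%), so they can also be computed by hand:
n = len(df_2['19-avg'].dropna())
print('95%% significance band: +/- %.4f' % (1.96 / np.sqrt(n)))
print('99%% significance band: +/- %.4f' % (2.58 / np.sqrt(n)))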
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df_2['19-avg'], model='multiplicative', freq=365)  # newer statsmodels versions call this argument 'period'
result.plot()
pyplot.show()
# data to fit
series_2 = (np.asarray(df_2['01']).astype( 'float32' ) / np.asarray(df_2['01']).astype( 'float32' ).max())
series = (df_2['19'].dropna().astype('float64') / df_2['19'].dropna().astype('float64').max() )
print(series_2.max())
print(series.max())
series = [z for z in series]
#series_2 = df_2[['day_of_week', 'const']]
Iterate over selected pdq and PDQM parameters.
Running the cell below might take a long time (days).
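The nested loops below walk through the same grid that could also be generated flat with itertools; this sketch is only for illustration and is not used by the cell below:
import itertools
pdq_grid = list(itertools.product([0, 1], range(0, 1), range(0, 1)))  # (p, d, q) candidates
PDQM_grid = list(itertools.product([0, 1, 2], [0, 1, 2], [0, 1, 3], range(0, 3)))  # (P, D, Q, M) candidates
print(len(pdq_grid), 'non-seasonal x', len(PDQM_grid), 'seasonal candidates')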
import sys
import warnings
from math import sqrt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import datetime
# Print iterations progress
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end=printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()
# evaluate an SARIMAX model for a given order (p,d,q) and seasonal order (P,D,Q,M)
def evaluate_sarimax_model(X, X_2, arima_order, arima_seasonal_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    train_2, test_2 = X_2[0:train_size], X_2[train_size:]
    history = [x for x in train]
    history_2 = [z for z in train_2]
    endog = history
    exog = history_2
    # walk-forward over the test set, refitting the model at every step
    predictions = list()
    printProgressBar(0, len(test), prefix='Progress:', suffix='Complete', length=80)
    for t in range(len(test)):
        printProgressBar(t, len(test), prefix='Progress:', suffix='Complete', length=80)
        try:
            model = SARIMAX(endog, exog=exog, order=arima_order, seasonal_order=arima_seasonal_order)
            model_fit = model.fit(disp=0)
            # one-step forecast, using the latest exogenous value
            yhat = model_fit.forecast(steps=1, exog=[[history_2[-1]]], dynamic=True)[0]
            predictions.append(yhat)
            history.append(test[t])
            history_2.append(test_2[t])
        except:
            print("... exception at step %.i" % t)
            e = sys.exc_info()[0]
            print(e)
    # calculate out-of-sample error
    rmse = np.sqrt(mean_squared_error(test, predictions))
    print(rmse)
    return rmse
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, dataset_2, my_order, my_seasonal_order, P_values, D_values, Q_values, M_values, p_values, d_values, q_values, searchmode):
    best_score, best_cfg = float("inf"), None
    # searchmode 0: fit a single SARIMAX model with the given orders, no grid search
    if searchmode == 0:
        model = SARIMAX(series, exog=series_2, order=my_order, seasonal_order=my_seasonal_order)  # note: uses the global series/series_2
        model_fit = model.fit(disp=True)
        print(model_fit.summary())
        fo = model_fit.forecast(steps=1, exog=[[2]], dynamic=True)[0]
        print(fo)
        return model_fit
    i = 1
    if searchmode == 2 or searchmode == 3:  # 2 = iterate pdq, keep my_seasonal_order
        for p in p_values:
            for d in d_values:
                for q in q_values:
                    my_order = (p, d, q)
                    if searchmode == 2:
                        try:
                            rmse = evaluate_sarimax_model(dataset, dataset_2, my_order, my_seasonal_order)
                            if rmse < best_score:
                                best_score, best_cfg = rmse, str(my_order)+str(my_seasonal_order)
                            print('\nARIMA%s%s RMSE=%.3f Round %s' % (my_order, my_seasonal_order, rmse, i))
                            print(datetime.datetime.now())
                            i += 1
                        except:
                            print('\nexcept ARIMA%s%s Round %s' % (my_order, my_seasonal_order, i))
                            continue
                    if searchmode == 3:
                        print('searchmode==3 is not implemented')
                        break
    if searchmode == 1:  # 1 = iterate PDQM, keep my_order
        for P in P_values:
            for D in D_values:
                for Q in Q_values:
                    for M in M_values:
                        my_seasonal_order = (P, D, Q, M)
                        try:
                            rmse = evaluate_sarimax_model(dataset, dataset_2, my_order, my_seasonal_order)
                            if rmse < best_score:
                                best_score, best_cfg = rmse, str(my_order)+str(my_seasonal_order)
                            print('\nARIMA%s%s RMSE=%.3f Round %s \n' % (my_order, my_seasonal_order, rmse, i))
                            print(datetime.datetime.now())
                            i += 1
                        except:
                            print('\nexcept ARIMA%s%s Round %s' % (my_order, my_seasonal_order, i))
                            continue
    print(' Best SARIMAX%s RMSE=%.3f ' % (best_cfg, best_score))
#my_order = (9, 1, 4) # good for 19-avg
my_order = (1, 0, 0) # good for 19
my_seasonal_order=(1, 1, 2, 2)
# evaluate parameters
p_values = [0, 1]
d_values = range(0, 1)
q_values = range(0, 1)
# evaluate parameters
P_values = [0, 1, 2]#[0, 1]#range(0, 3)
D_values = [0, 1, 2]#, 1]#range(0, 3)
Q_values = [0, 1, 3]#, 2]#range(0, 3)
M_values = range(0, 3)
#yhat = model_fit.predict(start=len(series), end=len(series))
#print("yhat 1: %.3f"%yhat)
# 0 = take my_order and my_seasonal_order # calc just one model
# 1 = take PDQM and my_order # iterate just seasonality
# 2 = take pdq and my_seasonal_order # iterate just arima
# 3 = take PDQM and pdq # iterate all
searchmode = 1
warnings.filterwarnings("ignore")
evaluate_models(series, series_2, my_order, my_seasonal_order, P_values, D_values, Q_values, M_values, p_values, d_values, q_values, searchmode)
# my_order = (1,1,2) my_seasonal_order=(1,1,1,12) rmse: 48.41061459833725 19-avg
# exception at 118
#my_order = (1,1,2) my_seasonal_order=(1,1,1,12) rmse: 48.41061459833725 rmse: 176.606 19
# Best SARIMAX(1,1,2)(1, 0, 1, 0) RMSE=174.665
# ARIMA(1, 0, 1, 103) RMSE=175.186 Round 1
# ARIMA(1, 0, 1, 104) RMSE=177.115 Round 2
print(0.07985695419329582*df_2['19'].dropna().astype('float64').max() )
I ran the model with different input parameters. Here are the more important results:

- ARIMA(0, 0, 0, 1) RMSE=35.855, Round 2, 2020-01-26 02:22:05.357431, target 19-avg
- Best SARIMAX(2, 0, 2, 0) RMSE=35.943 with series_2 = np.asarray(df_2['12']).astype('float32'), series = df_2['19-avg'].dropna().astype('float64')
- ARIMA(2, 0, 2, 0) RMSE=175.109, Round 30, with series_2 = np.asarray(df_2['03']).astype('float32'), series = df_2['19'].dropna().astype('float64')
- Best SARIMAX(1, 1, 2, 2) RMSE=172.336 with series_2 = np.asarray(df_2['01']).astype('float32'), series = df_2['19'].dropna().astype('float64')
- Best SARIMAX(1, 0, 0)(0, 1, 1, 2) RMSE=0.080 with both series scaled by their maxima; scaled back: 0.07985695419329582 * np.asarray(df_2['01']).astype('float32').max() = 405.5934703477495 and 0.07985695419329582 * df_2['19'].dropna().astype('float64').max() = 172.97016278267873

Both series have 1073 observations.
model_fit.summary():

                                 Statespace Model Results
==============================================================================================
Dep. Variable:                             19   No. Observations:                 1073
Model:          SARIMAX(1, 1, 2)x(1, 1, 1, 4)   Log Likelihood                -6657.311
Date:                        Sat, 25 Jan 2020   AIC                           13330.622
Time:                                19:51:22   BIC                           13370.410
Sample:                                     0   HQIC                          13345.696
                                       - 1073
Covariance Type:                          opg
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
day_of_week    -6.9472      2.257     -3.078      0.002     -11.370      -2.524
const           0.0021   8820.484   2.38e-07      1.000   -1.73e+04    1.73e+04
ar.L1           0.6690      0.040     16.778      0.000       0.591       0.747
ma.L1          -0.9981      0.485     -2.057      0.040      -1.949      -0.047
ma.L2          -0.0019      0.039     -0.048      0.962      -0.078       0.075
ar.S.L4        -0.0163      0.052     -0.312      0.755      -0.119       0.086
ma.S.L4        -1.0000      4.274     -0.234      0.815      -9.376       7.376
sigma2        1.47e+04   5.94e+04      0.247      0.805   -1.02e+05    1.31e+05
===================================================================================
Ljung-Box (Q):                 35.69   Jarque-Bera (JB):            579232.48
Prob(Q):                        0.66   Prob(JB):                         0.00
Heteroskedasticity (H):         4.31   Skew:                             7.86
Prob(H) (two-sided):            0.00   Kurtosis:                       116.00
===================================================================================
Warnings: [1] Covariance matrix calculated using the outer product of gradients (complex-step).
Forecast:
1073 148.824086
1074 145.935440
dtype: float64
from statsmodels.tsa.ar_model import AR
import numpy as np
# create a difference transform of the dataset
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        value = dataset[i] - dataset[i - 1]
        diff.append(value)
    return np.array(diff)
columnpredict = "19-avg"
series = df_2[columnpredict].astype('float64').dropna().values
model = AR(series)
model_fit = model.fit(maxlag=6, disp=False)
# should be the same country as in the DB query cell at the top
country = "ETH"
# save model to file
model_fit.save('ar_model_'+country+'_'+columnpredict+'.pkl')
# save the differenced dataset
X = difference(series)
np.save('ar_data_'+country+'_'+columnpredict+'.npy', X)
# save the last observation
np.save('ar_obs_'+country+'_'+columnpredict+'.npy', [series[-1]])
from statsmodels.tsa.ar_model import ARResults
country = 'ETH'
columnpredict = "19-avg"
# load the saved model and data
model = ARResults.load('ar_model_'+country+'_'+columnpredict+'.pkl')
data = np.load('ar_data_'+country+'_'+columnpredict+'.npy')
last_ob = np.load('ar_obs_'+country+'_'+columnpredict+'.npy')
print(len(data))
print(data)
print(last_ob)
# make prediction
predictions = model.predict(start=len(data), end=len(data))
print('Predicted difference: %f' % predictions)
# transform prediction: add the last observation back to the predicted difference
yhat = predictions[0] + last_ob[0]
print('Prediction: %f' % yhat)
from math import sqrt
from sklearn.metrics import mean_squared_error
# add a column with the future value
df_2[columnpredict+"+t1"] = df_2[columnpredict].shift(-1)
X = df_2[[columnpredict, columnpredict+"+t1"]].dropna().values
# split into train and test sets
train_size = int(len(X) * 0.66)
train, test = X[1:train_size], X[train_size:]
train_X, train_y = train[:,0], train[:,1]
test_X, test_y = test[:,0], test[:,1]
# persistence model: the forecast equals the current value
def model_persistence(x):
    return x
# trained AR model: predict the value at the given index
def model_predict(idx):
    predictions = model.predict(start=idx, end=idx)
    return predictions[0]
# walk-forward validation
predictions_persistence = list()
predictions_model = list()
i = 0
for x in test_X:
    yhat_persistence = model_persistence(x)
    predictions_persistence.append(yhat_persistence)
    yhat_model = model_predict(i + train_size)
    predictions_model.append(yhat_model)
    i += 1
rmse_persistence = sqrt(mean_squared_error(test_y, predictions_persistence))
rmse_model = sqrt(mean_squared_error(test_y, predictions_model))
print( ' Test RMSE persistence: %.3f ' % rmse_persistence)
print( ' Test RMSE model: %.3f ' % rmse_model)
%matplotlib inline
pyplot.plot(train_y)
pyplot.plot([None for i in train_y] + [x for x in test_y], 'g-', label='test_y')
pyplot.plot([None for i in train_y] + [x for x in predictions_persistence], 'r-', label='prediction_pers')
pyplot.plot([None for i in train_y] + [x for x in predictions_model], 'y-', label='prediction_model')
pyplot.show()
import statsmodels.api as sm
fig, ax = pyplot.subplots()
ax = df_2[columnpredict].plot(ax=ax)
fig = model.plot_predict(len(data),len(data)+20, dynamic=True, ax=ax, plot_insample=False)
pyplot.show()