The CSV data is based on the following MySQL query against the gdelt_events_country_ETH table.
The gdelt_events_country_ETH table is the subset of the GDELT events data containing the records associated with Ethiopia.
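The queries below rely on only three columns of that table: Day (the event date as a YYYYMMDD integer), EventRootCode (the two-digit CAMEO root code of the event), and NumArticles (the number of source articles); the rest of the GDELT schema is not used here.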
import numpy as np # load prev. data for prediction
import os # del files, sys exit
import pandas as pd
# get fresh data from database:
import mysql.connector
from mysql.connector import Error
def execute_query(query, cnx):
    # run a query on an open connection and return all rows
    cursor = cnx.cursor(raw=True, buffered=True)
    result = None  # stays None if the query fails
    try:
        cursor.execute(query)
        result = cursor.fetchall()
        cursor.close()
    except Error as e:
        print("sql query error")
        errorstring = format(str(e.args[0]))
        errornumber = errorstring[:4]
        print(errornumber)
        print(errorstring)
    return result
# two alternative connection configurations; flip the constants to switch
if 1 == 2:  # remote database (disabled)
    mysql_config = {
        'user': 'DB-user',
        'password': 'DB-password',
        'host': 'xxx.xxx.xxx.xxx',
        'database': 'DB-name',
        'raise_on_warnings': True,
        'port': 'DB-port',
    }
if 1 == 1:  # local database (active)
    mysql_config = {
        'user': 'DB-user',
        'password': 'DB-password',
        'host': '127.0.0.1',
        'database': 'DB-name',
        'raise_on_warnings': True,
        'port': '3306',
        'ssl_disabled': True
    }
try:
    cnx = mysql.connector.connect(**mysql_config)
    if cnx.is_connected():
        print('connected to datacoll 2')
except Error as e:
    print("mysql DB connection error")
    print(e)
    print(format(str(e.args[0])))
    os.sys.exit(1)
# the model will be for this country:
# CHE AUT DEU ER ETH SDN SYR USA RW
country = "ETH"
query_general = """select Day
    , sum(NumArticles) as 'total'
    , sum(if(EventRootCode = '01', NumArticles, 0)) as '01'
    , sum(if(EventRootCode = '02', NumArticles, 0)) as '02'
    , sum(if(EventRootCode = '03', NumArticles, 0)) as '03'
    , sum(if(EventRootCode = '04', NumArticles, 0)) as '04'
    , sum(if(EventRootCode = '05', NumArticles, 0)) as '05'
    , sum(if(EventRootCode = '06', NumArticles, 0)) as '06'
    , sum(if(EventRootCode = '07', NumArticles, 0)) as '07'
    , sum(if(EventRootCode = '08', NumArticles, 0)) as '08'
    , sum(if(EventRootCode = '09', NumArticles, 0)) as '09'
    , sum(if(EventRootCode = '10', NumArticles, 0)) as '10'
    , sum(if(EventRootCode = '11', NumArticles, 0)) as '11'
    , sum(if(EventRootCode = '12', NumArticles, 0)) as '12'
    , sum(if(EventRootCode = '13', NumArticles, 0)) as '13'
    , sum(if(EventRootCode = '14', NumArticles, 0)) as '14'
    , sum(if(EventRootCode = '15', NumArticles, 0)) as '15'
    , sum(if(EventRootCode = '16', NumArticles, 0)) as '16'
    , sum(if(EventRootCode = '17', NumArticles, 0)) as '17'
    , sum(if(EventRootCode = '18', NumArticles, 0)) as '18'
    , sum(if(EventRootCode = '19', NumArticles, 0)) as '19'
    , sum(if(EventRootCode = '20', NumArticles, 0)) as '20'
    from gdelt_events_country_""" + country + """
    where left(Day, 4) > 2016
    group by Day;"""
query_19 = """select Day
    , sum(NumArticles) as 'total'
    , sum(if(EventRootCode = '17', NumArticles, 0)) as '17'
    , sum(if(EventRootCode = '18', NumArticles, 0)) as '18'
    , sum(if(EventRootCode = '19', NumArticles, 0)) as '19'
    from gdelt_events_country_""" + country + """
    where left(Day, 4) > 2016
    group by Day;"""
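# for reference: the sum(if(...)) pattern above is conditional aggregation,
# i.e. a pivot of EventRootCode into one column per code; a rough pandas
# equivalent, assuming a hypothetical per-event dataframe df_raw with the
# same three columns, would be:
#   df_raw.pivot_table(index='Day', columns='EventRootCode',
#                      values='NumArticles', aggfunc='sum', fill_value=0)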
columns_all = ["Day", "total", "17", "18", "19"]
result = execute_query(query_19, cnx)
# build the dataframe column by column; the raw cursor returns bytearrays,
# so every value is decoded to str first
df_1 = pd.DataFrame(columns=columns_all)
column_list = []
for col in range(len(columns_all)):
    for item in result:
        column_list.append(bytes(item[col]).decode())
    df_1[columns_all[col]] = column_list
    column_list = []
print(df_1.head())
df_1.to_csv(r'ar_data_'+country+'.csv')
### Load data and convert date
Import the CSV data as queried above. This is faster than the database query when running the notebook multiple times.
import pandas as pd
import numpy as np
csvfilename = 'ar_data_ETH.csv'
# file path relative to the jupyter notebook
csvfilepath = '../data/'
csvfilepath = ''  # override: the CSV sits next to the notebook
df_1 = pd.read_csv(csvfilepath + csvfilename,
                   delimiter=',',
                   header=0,
                   index_col=False)
# parse the YYYYMMDD integers into proper datetimes
df_1['Day'] = pd.to_datetime(df_1['Day'], format='%Y%m%d')
# drop the index column that to_csv wrote out above
df_1 = df_1.drop(['Unnamed: 0'], axis=1)
df_1['day_of_week'] = df_1['Day'].dt.dayofweek
df_1['week'] = df_1['Day'].dt.week  # deprecated in newer pandas: use dt.isocalendar().week
print(df_1.dtypes)
print(df_1.head(9))
listofcolumns = df_1.columns
print(listofcolumns)
#print("shape: ", df_1.shape)
#df_1.plot(subplots=True, figsize=(20,25), grid=True)
df_1['19'].plot(subplots=True, figsize=(20,15), grid=True, style= 'k.')
def add_stats(df_stats, columnName):
    # add rolling mean, min, max and a lagged copy to the given column
    wwidth = 5  # rolling window width
    window2 = df_stats[columnName].rolling(window=wwidth)
    # add statistical columns
    df_stats[columnName+'-avg'] = window2.mean()
    df_stats[columnName+'-min'] = window2.min()
    df_stats[columnName+'-max'] = window2.max()
    # lagged copy of the column, shifted by wwidth-1 steps
    columnNameShift = columnName+'-t-'+str(wwidth)
    df_stats[columnNameShift] = df_stats[columnName].shift(wwidth-1)
    return df_stats
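# quick illustration of add_stats on hypothetical data:
#   demo = pd.DataFrame({'x': [1., 2., 4., 7., 11., 16.]})
#   add_stats(demo, 'x')  # adds 'x-avg', 'x-min', 'x-max' and 'x-t-5'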
def loop_columns(df_loop, exclude=[]):
    # loop over all columns, skipping the given list of column names
    # df_loop: dataframe whose columns are iterated
    # exclude: list of column names to exclude from the loop
    # returns the (possibly altered) dataframe
    for (columnName, columnData) in df_loop.iteritems():
        if columnName not in exclude:
            # do something with this column, e.g. call a function
            df_loop = add_stats(df_loop, columnName)
    return df_loop
df_2 = df_1.copy()
df_2 = loop_columns(df_2, ["Day"])
# calendar features derived from the Day column
df_2['month'] = df_2.Day.dt.month
df_2['day'] = df_2.Day.dt.day
df_2['year'] = df_2.Day.dt.year
df_2['day_diff'] = (df_2.Day - df_2.Day[0]).dt.days
df_2['const'] = 1
# look at specific columns: is everything right?
print(df_2.iloc[:,:30].head())
#print(df_2.dtypes)
#print(df_2.columns)
# delete the old df to free memory:
del df_1
# show python vars in buffer:
#%whos
from matplotlib import pyplot
import seaborn as sns
# use %matplotlib qt for a separate plot window; inline here
%matplotlib inline
plot_column_y = "19"
plot_column_x = "Day"
ax=df_2.plot(x=plot_column_x, y=plot_column_y, color="red", figsize=(20,15), grid=False)
df_2.plot(x=plot_column_x, y=[plot_column_y+'-min', plot_column_y+'-max'], ax=ax, color="grey")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-t-5' ], ax=ax, color="green")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-avg' ], ax=ax, color="blue")
pyplot.show()
# compare two datasets from the imported data:
%matplotlib inline
fig, ax1 = pyplot.subplots(figsize=(20,10))
# use ax1.twinx() here for two independent y-scales; both curves share one axis
ax2 = ax1  # .twinx()
ax1.plot(df_2[plot_column_x], df_2[plot_column_y], 'g-')
ax2.plot(df_2[plot_column_x], df_2['total'], 'b-')
ax1.set_xlabel(plot_column_x)
ax1.set_ylabel(plot_column_y, color='g')
ax2.set_ylabel('total', color='b')
ax1.set(title=plot_column_y+" and total" )
pyplot.show()
# create a difference transform of the dataset
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        diff.append(dataset[i] - dataset[i - 1])
    return np.array(diff)
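# e.g. difference(np.array([3., 5., 4.])) -> array([ 2., -1.])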
# calc diff and ln diff
df_2[plot_column_y+'-ln'] = np.log(df_2[plot_column_y])
df_2[plot_column_y+'-ln'+'-diff'] = df_2[plot_column_y+'-ln'].dropna().diff()
df_2[plot_column_y+'-diff'] = df_2[plot_column_y].diff()
df_2.head()
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
# Differences
axes[0].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-diff'], '-')
axes[0].set(title='difference of '+plot_column_y)
# Log difference
axes[1].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-ln'+"-diff"], '-')
axes[1].hlines(0, df_2.index[0], df_2.index[-1], 'r')
axes[1].set(title='19 - difference of logs');
# plot with various axes scales
pyplot.figure(figsize=(20,9))
n_bins = 50
pyplot.subplot(221)
pyplot.plot(df_2['19-avg'] )
pyplot.yscale('linear')
pyplot.title('19 avg linear')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 linear hist')
pyplot.grid(True)
pyplot.subplot(223)
pyplot.plot(df_2['19-avg'])
pyplot.yscale('log')
pyplot.title('19 avg log')
pyplot.grid(True)
pyplot.subplot(224)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('log')
pyplot.title('19 log hist')
pyplot.grid(True)
pyplot.show()
print('The histograms show that the values are not Gaussian-distributed.\n\n')
sns.boxplot( y=df_2['19-avg']);
pyplot.show()
This plot draws a box between the 25th and 75th percentiles, capturing the middle 50% of the observations. A line marks the 50th percentile (the median), and whiskers above and below the box summarize the general extent of the data. Dots mark outliers beyond the whiskers.
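To put numbers on what the box summarizes, the quartiles and whisker bounds can be computed directly; a minimal sketch on the same column:
q1, med, q3 = df_2['19-avg'].quantile([0.25, 0.5, 0.75])
iqr = q3 - q1
print('Q1=%.1f median=%.1f Q3=%.1f whiskers=[%.1f, %.1f]'
      % (q1, med, q3, q1 - 1.5*iqr, q3 + 1.5*iqr))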
pyplot.figure(figsize=(20,9))
n_bins = 50
pyplot.subplot(221)
pyplot.plot(np.sqrt(df_2['19-avg'].values))
pyplot.yscale('linear')
pyplot.title('19 avg sqrt')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(np.sqrt(df_2['19-avg'].values), bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 avg sqrt hist')
pyplot.grid(True)
pyplot.show()
from scipy.stats import boxcox
# boxcox requires strictly positive data and cannot handle the leading
# NaNs from the rolling window, hence the dropna()
boxcox_values, lam = boxcox(df_2['19-avg'].dropna().values)
print('Lambda: %f' % lam)
pyplot.figure(figsize=(20,9))
pyplot.subplot(221)
pyplot.plot(boxcox_values)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(boxcox_values, bins=50)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox hist')
pyplot.grid(True)
pyplot.show()
import statsmodels.api as sm
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
print(df_2.iloc[1:][plot_column_y+'-ln'+'-diff'])
fig = sm.graphics.tsa.plot_acf(df_2['19-avg'].dropna(), lags=150, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(df_2['19-avg'].dropna(), lags=150, ax=axes[1])
A correlation value calculated between two groups of numbers, such as observations and their lag-1 values, lies between -1 and 1. The sign indicates a negative or positive correlation, respectively. A value close to zero suggests a weak correlation, whereas a value closer to -1 or 1 indicates a strong correlation.
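As a quick sanity check, the lag-1 value shown in the ACF plot can be approximated from first principles (a sketch on the smoothed column; the Series.autocorr() call used a few cells below gives the same number):
v = df_2['19-avg'].dropna().values
# Pearson correlation between the series and its lag-1 copy
print(np.corrcoef(v[:-1], v[1:])[0, 1])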
from pandas.plotting import autocorrelation_plot
pyplot.figure(figsize=(15, 6))
ax = autocorrelation_plot(df_2['19-avg'].dropna())
ax.set_xlim([0,120])
ax.set_ylim([-0.25,0.75])
print(df_2['19-avg'].dropna().autocorr())
pyplot.show()
Dotted lines are provided to indicate that any correlation values above those lines are statistically significant (meaningful).
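The placement of those significance bands can be reproduced by hand: under a white-noise null hypothesis they sit at roughly ±z/√n (a sketch for the 95% band with z ≈ 1.96):
n = len(df_2['19-avg'].dropna())
print('95%% significance band: +/- %.3f' % (1.96 / np.sqrt(n)))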
from statsmodels.tsa.seasonal import seasonal_decompose
# multiplicative decomposition with a yearly cycle
# (newer statsmodels releases use period=365 instead of freq=365)
result = seasonal_decompose(df_2['19-avg'].dropna(), model='multiplicative', freq=365)
result.plot()
pyplot.show()
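The returned object exposes the individual components for further inspection; a minimal sketch:
# trend, seasonal and residual are Series aligned with the input index
print(result.trend.dropna().head())
print(result.seasonal.head())
print(result.resid.dropna().head())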
The following grid search takes about 10 hours when run over the full parameter ranges.
import warnings
from math import sqrt
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
# Print iterations progress
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Call in a loop to create a terminal progress bar
    @params:
        iteration  - Required : current iteration (Int)
        total      - Required : total iterations (Int)
        prefix     - Optional : prefix string (Str)
        suffix     - Optional : suffix string (Str)
        decimals   - Optional : positive number of decimals in percent complete (Int)
        length     - Optional : character length of bar (Int)
        fill       - Optional : bar fill character (Str)
        printEnd   - Optional : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end=printEnd)
    # print a new line on completion
    if iteration == total:
        print()
# evaluate an ARIMA model for a given order (p,d,q) with walk-forward validation
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make one-step predictions, refitting after each step
    predictions = list()
    printProgressBar(0, len(test), prefix='Progress:', suffix='Complete', length=80)
    for t in range(len(test)):
        printProgressBar(t, len(test), prefix='Progress:', suffix='Complete', length=80)
        model = ARIMA(endog=history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out-of-sample error
    rmse = sqrt(mean_squared_error(test, predictions))
    return rmse
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    best_score, best_cfg = float("inf"), None
    i = 1
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    print(order)
                    rmse = evaluate_arima_model(dataset, order)
                    if rmse < best_score:
                        best_score, best_cfg = rmse, order
                    print('ARIMA%s RMSE=%.3f Round %s' % (order, rmse, i))
                    i += 1
                except Exception:
                    # some orders fail to converge; skip them
                    continue
    print('Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))
# data to fit
#columnpredict = "19-avg-log"
columnpredict = "19-avg"
series = df_2[columnpredict].astype('float64').dropna().values
# evaluate parameters (restricted here to the best order found so far;
# widen the ranges for a full grid search)
p_values = [9]  # range(3, 4)
d_values = [1]  # range(0, 3)
q_values = [4]  # range(0, 4)
warnings.filterwarnings("ignore")
evaluate_models(series, p_values, d_values, q_values)
Results log of the grid search runs:

- Best ARIMA(1, 0, 0) RMSE=173.122 for 19 ETH
- Best ARIMA(2, 1, 2) RMSE=47.045 for 19-avg ETH
- Best ARIMA(1, 1, 2) RMSE=422.358 for 19-avg ETH, all data
- Best ARIMA(4, 1, 2) RMSE=41.080 for 19-avg ETH, all data
- Best ARIMA(6, 1, 3) RMSE=39.331 for 19-avg ETH
- Best ARIMA(7, 1, 3) RMSE=38.876
- Best ARIMA(7, 1, 4) RMSE=36.079 for 19-avg ETH
- Best ARIMA(9, 1, 4) RMSE=35.904 for 19-avg ETH
- Best ARIMA(3, 0, 2) RMSE=0.252 for df_2['19-avg-log']
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
import numpy as np
# create a difference transform of the dataset (redefined here so the cell
# can run on its own)
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        diff.append(dataset[i] - dataset[i - 1])
    return np.array(diff)
columnpredict = "19-avg"
series = df_2[columnpredict].astype('float64').dropna().values
# fit the AR model on the differenced series, so that a forecast is a
# predicted change that is added back to the last observed level
X = difference(series)
model = AR(X)
model_fit = model.fit(maxlag=6, disp=False)
# must be the same country as in the DB query cell at the top
country = "ETH"
# save the model to file
model_fit.save('ar_model_'+country+'_'+columnpredict+'.pkl')
# save the differenced dataset
np.save('ar_data_'+country+'_'+columnpredict+'.npy', X)
# save the last observation
np.save('ar_obs_'+country+'_'+columnpredict+'.npy', [series[-1]])
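Note: the AR class is deprecated in newer statsmodels releases. An equivalent fit with its replacement would look roughly like this (a sketch, assuming statsmodels >= 0.12; not used below):
from statsmodels.tsa.ar_model import AutoReg
model_fit = AutoReg(X, lags=6).fit()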
from statsmodels.tsa.ar_model import ARResults
country = 'ETH'
# load the model and the saved arrays
model = ARResults.load('ar_model_'+country+'_'+columnpredict+'.pkl')
data = np.load('ar_data_'+country+'_'+columnpredict+'.npy')
last_ob = np.load('ar_obs_'+country+'_'+columnpredict+'.npy')
print(len(data))
print(data)
print(last_ob)
# make a one-step out-of-sample prediction (in difference space)
predictions = model.predict(start=len(data), end=len(data))
print('Predicted difference: %f' % predictions[0])
# transform the predicted difference back into a level forecast
yhat = predictions[0] + last_ob[0]
print('Prediction: %f' % yhat)
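For example, if the last saved observation were 250.0 and the predicted difference +3.2, the level forecast would be 250.0 + 3.2 = 253.2 (illustrative numbers).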
from math import sqrt
# add a column with the next day's value as the prediction target
df_2[columnpredict+"+t1"] = df_2[columnpredict].shift(-1)
X = df_2[[columnpredict, columnpredict+"+t1"]].dropna().values
# split into train and test sets
train_size = int(len(X) * 0.66)
train, test = X[1:train_size], X[train_size:]
train_X, train_y = train[:,0], train[:,1]
test_X, test_y = test[:,0], test[:,1]
# persistence model: the naive baseline forecast y(t+1) = y(t);
# a trained model should at least beat its RMSE
def model_persistence(x):
    return x
# trained model: forecast the next change at position idx (the AR model was
# fit on the differenced series) and add it to the current level; since the
# model was fit on the full series, these are in-sample fitted values
def model_predict(idx, current_level):
    pred_diff = model.predict(start=idx, end=idx)
    return pred_diff[0] + current_level
# walk-forward validation
predictions_persistence = list()
predictions_model = list()
i = 0
for x in test_X:
    yhat_persistence = model_persistence(x)
    predictions_persistence.append(yhat_persistence)
    yhat_model = model_predict(i + train_size, x)
    predictions_model.append(yhat_model)
    i += 1
rmse_persistence = sqrt(mean_squared_error(test_y, predictions_persistence))
rmse_model = sqrt(mean_squared_error(test_y, predictions_model))
print( ' Test RMSE persistence: %.3f ' % rmse_persistence)
print( ' Test RMSE model: %.3f ' % rmse_model)
%matplotlib inline
pyplot.figure(figsize=(18, 6))
pyplot.plot(train_y)
pyplot.plot([None for i in train_y] + [x for x in test_y], 'g-', label='test_y')
pyplot.plot([None for i in train_y] + [x for x in predictions_persistence], 'r-', label='prediction_pers')
pyplot.plot([None for i in train_y] + [x for x in predictions_model], 'y-', label='prediction_model')
pyplot.legend()
pyplot.show()
import statsmodels.api as sm
import statsmodels
fig, ax = pyplot.subplots(figsize=(15, 6))
# plot the differenced series, since the loaded model was fit on differences
ax.plot(data)
fig = model.plot_predict(len(data), len(data)+20, dynamic=True, ax=ax, plot_insample=False)
pyplot.show()