The csv data is based on the following mysql query on the gdelt_events_country_ETH table.
The gdelt_events_country_ETH table is the fraction of the gdelt events which contain records associated with Ethiopia.
import numpy as np # load prev. data for prediction
import os # del files, sys exit
import pandas as pd
# get fresh data from database:
import mysql.connector
from mysql.connector import Error
def execute_query(query, cnx):
cursor = cnx.cursor(raw=True, buffered=True)
try:
cursor.execute(query)
result = cursor.fetchall()
cursor.close()
except Error as e:
print("sql query err")
#print(e)
errorstring = format(str(e.args[0]))
errornumber = errorstring[:4]
print(errornumber)
print(errorstring)
#result = "test"
return result
if 1 == 2:
mysql_config = {
'user': 'DB-user',
'password': 'DB-password',
'host': 'xxx.xxx.xxx.xxx',
'database': 'DB-name',
'raise_on_warnings': True,
'port': 'DB-port',
}
if 1 == 1:
mysql_config = {
'user': 'DB-user',
'password': 'DB-password',
'host': '127.0.0.1',
'database': 'DB-name',
'raise_on_warnings': True,
'port': '3306',
'ssl_disabled': True
}
try:
cnx = mysql.connector.connect(**mysql_config)
if cnx.is_connected():
#cursor = cnx.cursor(raw=True, buffered=True)
print('connected to datacoll 2') #
except Error as e:
print("mysql DB connection error")
#print("filename: " + filename + " row: " + str(row))
print(e)
print( "exception main: !! connection error in delete old")
print(format(str(e.args[0])))
os.sys.exit(1)
# print(db)
# the model will be for this country:
# CHE AUT DEU ER ETH SDN SYR USA RW
country = "ETH"
query_general = " select Day\
, sum(NumArticles) as 'total'\
, sum( if(EventRootCode = '01', NumArticles, 0) ) as '01'\
, sum( if(EventRootCode = '02', NumArticles, 0) ) as '02'\
, sum( if(EventRootCode = '03', NumArticles, 0) ) as '03'\
, sum( if(EventRootCode = '04', NumArticles, 0) ) as '04'\
, sum( if(EventRootCode = '05', NumArticles, 0) ) as '05'\
, sum( if(EventRootCode = '06', NumArticles, 0) ) as '06'\
, sum( if(EventRootCode = '07', NumArticles, 0) ) as '07'\
, sum( if(EventRootCode = '08', NumArticles, 0) ) as '08'\
, sum( if(EventRootCode = '09', NumArticles, 0) ) as '09'\
, sum( if(EventRootCode = '10', NumArticles, 0) ) as '10'\
, sum( if(EventRootCode = '11', NumArticles, 0) ) as '11'\
, sum( if(EventRootCode = '12', NumArticles, 0) ) as '12'\
, sum( if(EventRootCode = '13', NumArticles, 0) ) as '13'\
, sum( if(EventRootCode = '14', NumArticles, 0) ) as '14'\
, sum( if(EventRootCode = '15', NumArticles, 0) ) as '15'\
, sum( if(EventRootCode = '16', NumArticles, 0) ) as '16'\
, sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'\
, sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'\
, sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'\
, sum( if(EventRootCode = '20', NumArticles, 0) ) as '20'\
from gdelt_events_country_"+country+" where left(Day,4) > 2016 group by Day;"
query_19 = " select Day\
, sum(NumArticles) as 'total'\
, sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'\
, sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'\
, sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'\
from gdelt_events_country_"+country+" where left(Day,4) > 2016 group by Day;"
columns_all = ["Day", "total", "17", "18", "19"]
#"select * from gdelt_events limit 2;"
result = execute_query(query_19, cnx)
#print(result)
column_day = []
column_list = []
df_1 = pd.DataFrame( columns=columns_all)
df_1 = df_1.fillna(0) # with 0s rather than NaNs
for col in range(len(columns_all)):
for item in result:
column_list.append(bytes(item[col]).decode())
df_1[columns_all[col]] = column_list
column_list = []
#df_1 = pd.DataFrame(column_day, columns = ['Day'])
#df_1['19'] = column_list
print(df_1.head())
#print(df_1.dtypes)
df_1.to_csv(r'ar_data_'+country+'.csv')
#np.save('ar_data_'+country+'.npy', column_list)
### Load data and convert date
Import cvs data as queried above. This is faster than database query when runnung it multiple times.
import pandas as pd
import numpy as np
#csvfilename = 'events_ETH_Day_NumArt_sumRootCode.csv'
csvfilename = 'ar_data_ETH.csv'
# filepath relative to jupyter notebook
csvfilepath = '../data/'
csvfilepath = ''
df_1 = pd.read_csv(csvfilepath + csvfilename
,delimiter=','
#,skipinitialspace = True
#,encoding='latin_1'
,header=0
,index_col=False
)
# change format of datetime
df_1['Day'] = pd.to_datetime(df_1['Day'], format='%Y%m%d')
df_1 = df_1.drop(['Unnamed: 0'], axis=1)
df_1['day_of_week'] = df_1['Day'].dt.dayofweek
df_1['week'] = df_1['Day'].dt.week
print(df_1.dtypes)
print(df_1.head(9))
listofcolumns = df_1.columns
print(listofcolumns)
#print("shape: ", df_1.shape)
#df_1.plot(subplots=True, figsize=(20,25), grid=True)
df_1['19'].plot(subplots=True, figsize=(20,15), grid=True, style= 'k.')
def add_stats(df_stats, columnName):
# add rolling mean, min, max to each column of a given dataframe
# create window for calculating avg.
wwidth = 5 # window width
window2 = df_stats[columnName].rolling(window=wwidth) # shifted
# add statistical data
df_stats[columnName+'-avg'] = window2.mean()
df_stats[columnName+'-min'] = window2.min()
df_stats[columnName+'-max'] = window2.max()
columnNameShift=columnName+'-t-'+str(wwidth)
shifted = df_stats[columnName].shift(wwidth-1)
df_stats[columnNameShift]=df_stats[columnName].shift(wwidth-1)
# testing expanding window
window = df_stats[columnName].expanding()
return df_stats
def loop_columns(df_loop, exclude=[]):
# loop over all comumns and exclude the given list of columns
# df_loop: df to loop over columns
# exclude: list of column names to exclude from loops
# returns a df (which might be altered)
for (columnName, columnData) in df_loop.iteritems():
if not columnName in exclude:
# Do s.th. with this column, e.g. call a function
#print('Colunm Name : ', columnName)
df_loop = add_stats(df_loop, columnName)
#df = None
return df_loop
df_2 = df_1.copy()
df_2 = loop_columns(df_2, ["Day"])
df_2['month'] = [df_2.Day[i].month for i in range(len(df_2))]
df_2['day'] = [df_2.Day[i].day for i in range(len(df_2))]
df_2['year'] = [df_2.Day[i].year for i in range(len(df_2))]
df_2['day_diff'] = [(df_2.Day[i] - df_2.Day[0]).days for i in range(len(df_2))]
df_2['const']=1
# look at specific columns, everything's right?
print(df_2.iloc[:,:30].head())
#print(df_2.dtypes)
#print(df_2.columns)
# delete old df to free memory:
del df_1
# show python vars in buffer:
#%whos
from matplotlib import pyplot
import seaborn as sns
# make extra window for plot
#%matplotlib qt
%matplotlib inline
plot_column_y = "19"
plot_column_x = "Day"
ax=df_2.plot(x=plot_column_x, y=plot_column_y, color="red", figsize=(20,15), grid=False)
df_2.plot(x=plot_column_x, y=[plot_column_y+'-min', plot_column_y+'-max'], ax=ax, color="grey")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-t-5' ], ax=ax, color="green")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-avg' ], ax=ax, color="blue")
#df_2.plot(title=plot_column_y+" avg, min, max")
#df_2.plot.close()
pyplot.show()
# compare two datasets from the imported data:
%matplotlib inline
fig, ax1 = pyplot.subplots(figsize=(20,10))
# use twinx for 2 scales
ax2 = ax1#.twinx()
ax1.plot(df_2[plot_column_x], df_2[plot_column_y], 'g-')
ax2.plot(df_2[plot_column_x], df_2['total'], 'b-')
ax1.set_xlabel(plot_column_x)
ax1.set_ylabel(plot_column_y, color='g')
ax2.set_ylabel('total', color='b')
ax1.set(title=plot_column_y+" and total" )
pyplot.show()
# create a difference transform of the dataset
def difference(dataset):
diff = list()
for i in range(1, len(dataset)):
value = dataset[i] - dataset[i - 1]
diff.append(value)
return np.array(diff)
# calc diff and ln diff
df_2[plot_column_y+'-ln'] = np.log(df_2[plot_column_y])
df_2[plot_column_y+'-ln'+'-diff'] = df_2[plot_column_y+'-ln'].dropna().diff()
df_2[plot_column_y+'-diff'] = df_2[plot_column_y].diff()
df_2.head()
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
# Levels
axes[0].plot(df_2.index._mpl_repr(),df_2[plot_column_y+'-diff'], '-')
axes[0].set(title='ln of '+plot_column_y)
# Log difference
axes[1].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-ln'+"-diff"], '-')
axes[1].hlines(0, df_2.index[0], df_2.index[-1], 'r')
axes[1].set(title='19 - difference of logs');
# plot with various axes scales
pyplot.figure(figsize=(20,9))
n_bins = 50
pyplot.subplot(221)
pyplot.plot(df_2['19-avg'] )
pyplot.yscale('linear')
pyplot.title('19 avg linear')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 linear hist')
pyplot.grid(True)
pyplot.subplot(223)
pyplot.plot(df_2['19-avg'])
pyplot.yscale('log')
pyplot.title('19 avg log')
pyplot.grid(True)
pyplot.subplot(224)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('log')
pyplot.title('19 log hist')
pyplot.grid(True)
pyplot.show()
print('Histogramms shows that the values do not have a gaussian distribution.\n\n')
sns.boxplot( y=df_2['19-avg']);
pyplot.show()
This plot draws a box around the 25th and 75th percentiles of the data that captures the middle 50% of observations. A line is drawn at the 50th percentile (the median) and whiskers are drawn above and below the box to summarize the general extents of the observations. Dots are drawn for outliers outside the whiskers or extents of the data.
pyplot.figure(figsize=(20,9))
n_bins = 50
pyplot.subplot(221)
pyplot.plot(np.sqrt(df_2['19-avg'].values))
pyplot.yscale('linear')
pyplot.title('19 avg sqrt')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(np.sqrt(df_2['19-avg'].values), bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 avg sqrt')
pyplot.grid(True)
pyplot.show()
from scipy.stats import boxcox
df_2['19-avg-boxcox'], lam = boxcox(df_2['19-avg'].values)
print( ' Lambda: %f ' % lam)
pyplot.figure(figsize=(20,9))
pyplot.subplot(221)
pyplot.plot(df_2['19-avg-boxcox'].values)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(df_2['19-avg-boxcox'].values, bins=50)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox hist')
pyplot.grid(True)
pyplot.show()
import statsmodels.api as sm
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
df_2.set_index('Day')
print(df_2.iloc[1:][plot_column_y+'-ln'+'-diff'])
#fig = sm.graphics.tsa.plot_acf(df_2[plot_column_y], lags=20, ax=axes[0])
#fig = sm.graphics.tsa.plot_pacf(df_2[plot_column_y+"-diff"], lags=150, ax=axes[1])
#print(df_2['19-avg-log'].dropna())
fig = sm.graphics.tsa.plot_acf(df_2['19-avg'].dropna(), lags=150, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(df_2['19-avg'].dropna(), lags=150, ax=axes[1])
A correlation value calculated between two groups of numbers, such as observations and their lag=1 values, results in a number between -1 and 1. The sign of this number indicates a negative or positive correlation respectively. A value close to zero suggests a weak correlation, whereas a value closer to -1 or 1 indicates a strong correlation.
from pandas.plotting import autocorrelation_plot
#print(len(df_2['19'].dropna())) # 1073
#print(len(df_2['19'].dropna())) # 1069
#df_3 = df_2[['Day', '19', '19-avg']].dropna()
#print(df_3.iloc[110:120,0:8])
#fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
#axes[0].plot(autocorrelation_plot(df_2['18']))
#axes[0]
pyplot.figure(figsize=(15, 6))
ax = autocorrelation_plot(df_2['19-avg'].dropna())
ax.set_xlim([0,120])
ax.set_ylim([-0.25,0.75])
df_2['19-avg'].dropna().autocorr()
#autocorrelation_plot(df_2['19-avg'].dropna())
pyplot.show()
Dotted lines are provided that indicate any correlation values above those lines are statistically significant (meaningful). ma
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df_2['19-avg'].dropna(), model= 'multiplicative', freq=365)
result.plot()
pyplot.show()