Events country ETH time series forecasting

Get data from Database

The csv data is based on the following mysql query on the gdelt_events_country_ETH table.

The gdelt_events_country_ETH table is the fraction of the gdelt events which contain records associated with Ethiopia.

In [3]:
import numpy as np # load prev. data for prediction
import os # del files, sys exit

import pandas as pd

# get fresh data from database:
import mysql.connector
from mysql.connector import Error

def execute_query(query, cnx):
    cursor = cnx.cursor(raw=True, buffered=True)
    try:
        cursor.execute(query)
        result = cursor.fetchall()
        cursor.close()
    except Error as e:
        print("sql query err")
        #print(e)
        errorstring = format(str(e.args[0]))
        errornumber = errorstring[:4]
        print(errornumber)
        print(errorstring)

    #result = "test"
    return result

if 1 == 2:
    mysql_config = {
            'user': 'DB-user',
            'password': 'DB-password',
            'host': 'xxx.xxx.xxx.xxx',
            'database': 'DB-name',
            'raise_on_warnings': True,
            'port': 'DB-port',
        }

if 1 == 1:
    mysql_config = {
            'user': 'DB-user',
            'password': 'DB-password',
            'host': '127.0.0.1',
            'database': 'DB-name',
            'raise_on_warnings': True,
            'port': '3306',
            'ssl_disabled': True
        }

try:    
    cnx = mysql.connector.connect(**mysql_config)
    if cnx.is_connected():
        #cursor = cnx.cursor(raw=True, buffered=True)
        print('connected to datacoll 2')  #

except Error as e:
    print("mysql DB connection error")
    #print("filename: " + filename + " row: " + str(row))
    print(e)
    print( "exception main: !! connection error in delete old")
    print(format(str(e.args[0])))
    os.sys.exit(1)



# print(db)
# the model will be for this country:
# CHE AUT DEU ER ETH SDN SYR USA RW
country = "ETH"

query_general = " select Day\
    , sum(NumArticles)  as 'total'\
    , sum( if(EventRootCode = '01', NumArticles, 0) ) as '01'\
	, sum( if(EventRootCode = '02', NumArticles, 0) ) as '02'\
    , sum( if(EventRootCode = '03', NumArticles, 0) ) as '03'\
	, sum( if(EventRootCode = '04', NumArticles, 0) ) as '04'\
	, sum( if(EventRootCode = '05', NumArticles, 0) ) as '05'\
	, sum( if(EventRootCode = '06', NumArticles, 0) ) as '06'\
	, sum( if(EventRootCode = '07', NumArticles, 0) ) as '07'\
	, sum( if(EventRootCode = '08', NumArticles, 0) ) as '08'\
	, sum( if(EventRootCode = '09', NumArticles, 0) ) as '09'\
	, sum( if(EventRootCode = '10', NumArticles, 0) ) as '10'\
	, sum( if(EventRootCode = '11', NumArticles, 0) ) as '11'\
	, sum( if(EventRootCode = '12', NumArticles, 0) ) as '12'\
	, sum( if(EventRootCode = '13', NumArticles, 0) ) as '13'\
	, sum( if(EventRootCode = '14', NumArticles, 0) ) as '14'\
	, sum( if(EventRootCode = '15', NumArticles, 0) ) as '15'\
	, sum( if(EventRootCode = '16', NumArticles, 0) ) as '16'\
	, sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'\
	, sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'\
	, sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'\
    , sum( if(EventRootCode = '20', NumArticles, 0) ) as '20'\
    from gdelt_events_country_"+country+" where left(Day,4) > 2016 group by Day;"

query_19 = " select Day\
    , sum(NumArticles)  as 'total'\
    , sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'\
    , sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'\
    , sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'\
    from gdelt_events_country_"+country+" where left(Day,4) > 2016 group by Day;"

columns_all = ["Day", "total", "17", "18", "19"]
#"select * from gdelt_events limit 2;"
result = execute_query(query_19, cnx)
#print(result)
column_day = []
column_list = []

df_1 = pd.DataFrame( columns=columns_all)
df_1 = df_1.fillna(0) # with 0s rather than NaNs

for col in range(len(columns_all)):    
    for item in result:        
        column_list.append(bytes(item[col]).decode())
        
    df_1[columns_all[col]] = column_list    
    column_list = []


#df_1 = pd.DataFrame(column_day, columns = ['Day'])
#df_1['19'] = column_list
print(df_1.head())
#print(df_1.dtypes)
df_1.to_csv(r'ar_data_'+country+'.csv')
#np.save('ar_data_'+country+'.npy', column_list)

### Load data and convert date
connected to datacoll 2
        Day total   17  18  19
0  20170101   679   77  15  49
1  20170102  1670   54  12  73
2  20170103  1659  110   0  90
3  20170104  2038  132  10  32
4  20170105  1570  125  15  96

Get data from csv file

Import cvs data as queried above. This is faster than database query when runnung it multiple times.

In [6]:
import pandas as pd
import numpy as np

#csvfilename = 'events_ETH_Day_NumArt_sumRootCode.csv'
csvfilename = 'ar_data_ETH.csv'

# filepath relative to jupyter notebook
csvfilepath = '../data/'
csvfilepath = ''

df_1 = pd.read_csv(csvfilepath + csvfilename                   
                   ,delimiter=','
                   #,skipinitialspace = True
                   #,encoding='latin_1'
                   ,header=0
                   ,index_col=False 
                  )

# change format of datetime
df_1['Day'] = pd.to_datetime(df_1['Day'], format='%Y%m%d')
df_1 = df_1.drop(['Unnamed: 0'], axis=1)
df_1['day_of_week'] = df_1['Day'].dt.dayofweek
df_1['week'] = df_1['Day'].dt.week
print(df_1.dtypes)
print(df_1.head(9))
listofcolumns = df_1.columns
print(listofcolumns)
#print("shape: ", df_1.shape)
#df_1.plot(subplots=True, figsize=(20,25), grid=True)
df_1['19'].plot(subplots=True, figsize=(20,15), grid=True, style= 'k.')
Day            datetime64[ns]
total                   int64
17                      int64
18                      int64
19                      int64
day_of_week             int64
week                    int64
dtype: object
         Day  total   17  18   19  day_of_week  week
0 2017-01-01    679   77  15   49            6    52
1 2017-01-02   1670   54  12   73            0     1
2 2017-01-03   1659  110   0   90            1     1
3 2017-01-04   2038  132  10   32            2     1
4 2017-01-05   1570  125  15   96            3     1
5 2017-01-06   2806  121  30  289            4     1
6 2017-01-07   1523    8  10  163            5     1
7 2017-01-08   1099    9  11   92            6     1
8 2017-01-09   1588   34  26   49            0     2
Index(['Day', 'total', '17', '18', '19', 'day_of_week', 'week'], dtype='object')
Out[6]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fdb90532910>],
      dtype=object)

Add basic statistic properties

In [7]:
def add_stats(df_stats, columnName):
    # add rolling mean, min, max to each column of a given dataframe    
    
    # create window for calculating avg.
    wwidth = 5 # window width
    window2 = df_stats[columnName].rolling(window=wwidth) # shifted
    # add statistical data
    df_stats[columnName+'-avg'] = window2.mean()
    df_stats[columnName+'-min'] = window2.min()
    df_stats[columnName+'-max'] = window2.max()

    columnNameShift=columnName+'-t-'+str(wwidth)
    shifted = df_stats[columnName].shift(wwidth-1)
    df_stats[columnNameShift]=df_stats[columnName].shift(wwidth-1)
    # testing expanding window
    window = df_stats[columnName].expanding()
    return df_stats


def loop_columns(df_loop, exclude=[]):
    # loop over all comumns and exclude the given list of columns
    # df_loop: df to loop over columns
    # exclude: list of column names to exclude from loops
    # returns a df (which might be altered)
    
    for (columnName, columnData) in df_loop.iteritems():
        if not columnName in exclude:
            # Do s.th. with this column, e.g. call a function
            #print('Colunm Name : ', columnName)
            df_loop = add_stats(df_loop, columnName)
    
    #df = None 
    return df_loop


df_2 = df_1.copy()
df_2 = loop_columns(df_2, ["Day"])

df_2['month'] = [df_2.Day[i].month for i in range(len(df_2))]
df_2['day'] = [df_2.Day[i].day for i in range(len(df_2))]
df_2['year'] = [df_2.Day[i].year for i in range(len(df_2))]
df_2['day_diff'] = [(df_2.Day[i] - df_2.Day[0]).days for i in range(len(df_2))]
df_2['const']=1

# look at specific columns, everything's right?
print(df_2.iloc[:,:30].head())
#print(df_2.dtypes)
#print(df_2.columns)

# delete old df to free memory:
del df_1

# show python vars in buffer:
#%whos
         Day  total   17  18  19  day_of_week  week  total-avg  total-min  \
0 2017-01-01    679   77  15  49            6    52        NaN        NaN   
1 2017-01-02   1670   54  12  73            0     1        NaN        NaN   
2 2017-01-03   1659  110   0  90            1     1        NaN        NaN   
3 2017-01-04   2038  132  10  32            2     1        NaN        NaN   
4 2017-01-05   1570  125  15  96            3     1     1523.2      679.0   

   total-max  ...  19-min  19-max  19-t-5  day_of_week-avg  day_of_week-min  \
0        NaN  ...     NaN     NaN     NaN              NaN              NaN   
1        NaN  ...     NaN     NaN     NaN              NaN              NaN   
2        NaN  ...     NaN     NaN     NaN              NaN              NaN   
3        NaN  ...     NaN     NaN     NaN              NaN              NaN   
4     2038.0  ...    32.0    96.0    49.0              2.4              0.0   

   day_of_week-max  day_of_week-t-5  week-avg  week-min  week-max  
0              NaN              NaN       NaN       NaN       NaN  
1              NaN              NaN       NaN       NaN       NaN  
2              NaN              NaN       NaN       NaN       NaN  
3              NaN              NaN       NaN       NaN       NaN  
4              6.0              6.0      11.2       1.0      52.0  

[5 rows x 30 columns]

Plot data to understand

Plot basic stats

In [8]:
from matplotlib import pyplot
import seaborn as sns

# make extra window for plot
#%matplotlib qt
%matplotlib inline


plot_column_y = "19"
plot_column_x = "Day"
ax=df_2.plot(x=plot_column_x, y=plot_column_y, color="red", figsize=(20,15), grid=False)
df_2.plot(x=plot_column_x, y=[plot_column_y+'-min', plot_column_y+'-max'], ax=ax, color="grey")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-t-5' ], ax=ax, color="green")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-avg' ], ax=ax, color="blue")
#df_2.plot(title=plot_column_y+" avg, min, max")
#df_2.plot.close()
pyplot.show()

Compare datasets

In [9]:
# compare two datasets from the imported data:
%matplotlib inline

fig, ax1 = pyplot.subplots(figsize=(20,10))
# use twinx for 2 scales
ax2 = ax1#.twinx()

ax1.plot(df_2[plot_column_x], df_2[plot_column_y], 'g-')
ax2.plot(df_2[plot_column_x], df_2['total'], 'b-')
ax1.set_xlabel(plot_column_x)
ax1.set_ylabel(plot_column_y, color='g')
ax2.set_ylabel('total', color='b')
ax1.set(title=plot_column_y+" and total" )

pyplot.show()

Difference and log Diff

In [10]:
# create a difference transform of the dataset
def difference(dataset):
	diff = list()
	for i in range(1, len(dataset)):
		value = dataset[i] - dataset[i - 1]
		diff.append(value)
	return np.array(diff)


# calc diff and ln diff
df_2[plot_column_y+'-ln'] = np.log(df_2[plot_column_y])
df_2[plot_column_y+'-ln'+'-diff'] = df_2[plot_column_y+'-ln'].dropna().diff()
df_2[plot_column_y+'-diff'] = df_2[plot_column_y].diff()
df_2.head()

# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))

# Levels
axes[0].plot(df_2.index._mpl_repr(),df_2[plot_column_y+'-diff'], '-')
axes[0].set(title='ln of '+plot_column_y)

# Log difference
axes[1].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-ln'+"-diff"], '-')
axes[1].hlines(0, df_2.index[0], df_2.index[-1], 'r')
axes[1].set(title='19 - difference of logs');
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/pandas/core/series.py:853: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)

Histogramm

In [11]:
# plot with various axes scales
pyplot.figure(figsize=(20,9))

n_bins = 50
pyplot.subplot(221)
pyplot.plot(df_2['19-avg'] )
pyplot.yscale('linear')
pyplot.title('19 avg linear')
pyplot.grid(True)

pyplot.subplot(222)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 linear hist')
pyplot.grid(True)

pyplot.subplot(223)
pyplot.plot(df_2['19-avg'])
pyplot.yscale('log')
pyplot.title('19 avg log')
pyplot.grid(True)

pyplot.subplot(224)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('log')
pyplot.title('19 log hist')
pyplot.grid(True)

pyplot.show()

print('Histogramms shows that the values do not have a gaussian distribution.\n\n')
sns.boxplot( y=df_2['19-avg']);
pyplot.show()
Histogramms shows that the values do not have a gaussian distribution.


This plot draws a box around the 25th and 75th percentiles of the data that captures the middle 50% of observations. A line is drawn at the 50th percentile (the median) and whiskers are drawn above and below the box to summarize the general extents of the observations. Dots are drawn for outliers outside the whiskers or extents of the data.

Square root transform

In [12]:
pyplot.figure(figsize=(20,9))

n_bins = 50
pyplot.subplot(221)
pyplot.plot(np.sqrt(df_2['19-avg'].values))
pyplot.yscale('linear')
pyplot.title('19 avg sqrt')
pyplot.grid(True)

pyplot.subplot(222)
pyplot.hist(np.sqrt(df_2['19-avg'].values), bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 avg sqrt')
pyplot.grid(True)

pyplot.show()
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/numpy/lib/histograms.py:839: RuntimeWarning: invalid value encountered in greater_equal
  keep = (tmp_a >= first_edge)
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/numpy/lib/histograms.py:840: RuntimeWarning: invalid value encountered in less_equal
  keep &= (tmp_a <= last_edge)

boxcox transform

In [13]:
from scipy.stats import boxcox

df_2['19-avg-boxcox'], lam = boxcox(df_2['19-avg'].values)
print( ' Lambda: %f ' % lam)

pyplot.figure(figsize=(20,9))

pyplot.subplot(221)
pyplot.plot(df_2['19-avg-boxcox'].values)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox')
pyplot.grid(True)

pyplot.subplot(222)
pyplot.hist(df_2['19-avg-boxcox'].values, bins=50)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox hist')
pyplot.grid(True)

pyplot.show()
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/scipy/stats/morestats.py:1038: RuntimeWarning: invalid value encountered in less_equal
  if any(x <= 0):
 Lambda: 8.472136 

Autocorrelation (acf) and partial Autocorrelation (pacf)

In [14]:
import statsmodels.api as sm
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
df_2.set_index('Day')
print(df_2.iloc[1:][plot_column_y+'-ln'+'-diff'])
#fig = sm.graphics.tsa.plot_acf(df_2[plot_column_y], lags=20, ax=axes[0])
#fig = sm.graphics.tsa.plot_pacf(df_2[plot_column_y+"-diff"], lags=150, ax=axes[1])

#print(df_2['19-avg-log'].dropna())

fig = sm.graphics.tsa.plot_acf(df_2['19-avg'].dropna(), lags=150, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(df_2['19-avg'].dropna(), lags=150, ax=axes[1])
1       0.398639
2       0.209350
3      -1.034074
4       1.098612
5       1.102078
          ...   
1068    0.609064
1069   -0.747214
1070   -0.523248
1071    1.571217
1072    0.731368
Name: 19-ln-diff, Length: 1072, dtype: float64
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/statsmodels/regression/linear_model.py:1358: RuntimeWarning: invalid value encountered in sqrt
  return rho, np.sqrt(sigmasq)

Autocorrelation

A correlation value calculated between two groups of numbers, such as observations and their lag=1 values, results in a number between -1 and 1. The sign of this number indicates a negative or positive correlation respectively. A value close to zero suggests a weak correlation, whereas a value closer to -1 or 1 indicates a strong correlation.

In [15]:
from pandas.plotting import autocorrelation_plot


#print(len(df_2['19'].dropna())) # 1073
#print(len(df_2['19'].dropna())) # 1069
#df_3 = df_2[['Day', '19', '19-avg']].dropna()
#print(df_3.iloc[110:120,0:8])
#fig, axes = pyplot.subplots(1, 2, figsize=(25,4))

#axes[0].plot(autocorrelation_plot(df_2['18']))
#axes[0]
pyplot.figure(figsize=(15, 6))
ax = autocorrelation_plot(df_2['19-avg'].dropna())
ax.set_xlim([0,120])
ax.set_ylim([-0.25,0.75])
df_2['19-avg'].dropna().autocorr()
#autocorrelation_plot(df_2['19-avg'].dropna())
pyplot.show()

Dotted lines are provided that indicate any correlation values above those lines are statistically significant (meaningful). ma

Time Series Decomposition

In [22]:
from statsmodels.tsa.seasonal import seasonal_decompose

result = seasonal_decompose(df_2['19-avg'].dropna(), model= 'multiplicative', freq=365)
result.plot()
pyplot.show()