Events country ETH time series forecasting

Get data from Database

The CSV data is based on the following MySQL query on the gdelt_events_country_ETH table.

In [2]:
import numpy as np # load prev. data for prediction
import os # del files, sys exit

import pandas as pd

# get fresh data from database:
import mysql.connector
from mysql.connector import Error

def execute_query(query, cnx):
    # run a query on an open connection and return all rows (raw bytes, buffered)
    cursor = cnx.cursor(raw=True, buffered=True)
    result = None  # stays None if the query fails
    try:
        cursor.execute(query)
        result = cursor.fetchall()
        cursor.close()
    except Error as e:
        print("sql query err")
        #print(e)
        errorstring = format(str(e.args[0]))
        errornumber = errorstring[:4]
        print(errornumber)
        print(errorstring)

    return result

# remote DB config (disabled: the 1 == 2 branch never runs)
if 1 == 2:
    mysql_config = {
            'user': 'DB-user',
            'password': 'DB-password',
            'host': 'DB-server',
            'database': 'datacoll',
            'raise_on_warnings': True,
            'port': '45506',
        }

# local DB config (enabled: the 1 == 1 branch always runs)
if 1 == 1:
    mysql_config = {
            'user': 'DB-user',
            'password': 'DB-password',
            'host': '127.0.0.1',
            'database': 'datacoll',
            'raise_on_warnings': True,
            'port': '3306',
            'ssl_disabled': True
        }

try:    
    cnx = mysql.connector.connect(**mysql_config)
    if cnx.is_connected():
        #cursor = cnx.cursor(raw=True, buffered=True)
        print('connected to datacoll 2')  #

except Error as e:
    print("mysql DB connection error")
    #print("filename: " + filename + " row: " + str(row))
    print(e)
    print( "exception main: !! connection error in delete old")
    print(format(str(e.args[0])))
    os.sys.exit(1)



# print(db)
# the model will be for this country:
# CHE AUT DEU ER ETH SDN SYR USA RW
country = "ETH"

query_general = " select Day\
    , sum(NumArticles) as 'total'\
    , sum( if(EventRootCode = '01', NumArticles, 0) ) as '01'\
    , sum( if(EventRootCode = '02', NumArticles, 0) ) as '02'\
    , sum( if(EventRootCode = '03', NumArticles, 0) ) as '03'\
    , sum( if(EventRootCode = '04', NumArticles, 0) ) as '04'\
    , sum( if(EventRootCode = '05', NumArticles, 0) ) as '05'\
    , sum( if(EventRootCode = '06', NumArticles, 0) ) as '06'\
    , sum( if(EventRootCode = '07', NumArticles, 0) ) as '07'\
    , sum( if(EventRootCode = '08', NumArticles, 0) ) as '08'\
    , sum( if(EventRootCode = '09', NumArticles, 0) ) as '09'\
    , sum( if(EventRootCode = '10', NumArticles, 0) ) as '10'\
    , sum( if(EventRootCode = '11', NumArticles, 0) ) as '11'\
    , sum( if(EventRootCode = '12', NumArticles, 0) ) as '12'\
    , sum( if(EventRootCode = '13', NumArticles, 0) ) as '13'\
    , sum( if(EventRootCode = '14', NumArticles, 0) ) as '14'\
    , sum( if(EventRootCode = '15', NumArticles, 0) ) as '15'\
    , sum( if(EventRootCode = '16', NumArticles, 0) ) as '16'\
    , sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'\
    , sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'\
    , sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'\
    , sum( if(EventRootCode = '20', NumArticles, 0) ) as '20'\
    from gdelt_events_country_"+country+" where left(Day,4) > 2016 group by Day;"

query_19 = " select Day\
    , sum(NumArticles)  as 'total'\
    , sum( if(EventRootCode = '17', NumArticles, 0) ) as '17'\
    , sum( if(EventRootCode = '18', NumArticles, 0) ) as '18'\
    , sum( if(EventRootCode = '19', NumArticles, 0) ) as '19'\
    from gdelt_events_country_"+country+" where left(Day,4) > 2016 group by Day;"

columns_19 = ["Day", "total", "17", "18", "19"]
columns_general = ["Day", "total", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20"]
#"select * from gdelt_events limit 2;"
result = execute_query(query_general, cnx)
#print(result)
column_day = []
column_list = []

df_1 = pd.DataFrame( columns=columns_general)
df_1 = df_1.fillna(0) # with 0s rather than NaNs

for col in range(len(columns_general)):
    #print(col, columns_general[col])
    # the raw cursor returns bytes; decode each value into a string column
    for item in result:
        column_list.append(bytes(item[col]).decode())

    df_1[columns_general[col]] = column_list
    #print(df_1.head())
    column_list = []

#print(column_day)
#pd.DataFrame() 
#column_list = np.asarray(column_list)
#column_list = column_list.astype(int)
#print(column_list)

#df_1 = pd.DataFrame(column_day, columns = ['Day'])
#df_1['19'] = column_list
print(df_1.head())
#print(df_1.dtypes)
df_1.to_csv(r'ar_data_'+country+'.csv')
#np.save('ar_data_'+country+'.npy', column_list)

connected to datacoll 2
        Day total   01   02   03   04   05  06   07   08  ...   11  12  13  \
0  20170101   679   62   31   59  154  106   0   57   17  ...   31  10   1   
1  20170102  1670  197   52  186  305  320  22  163   49  ...   91  52  34   
2  20170103  1659  219  144   77  285  141  34  114   86  ...  230  32   4   
3  20170104  2038  248  136   87  560  131  54  144  171  ...  248  34   0   
4  20170105  1570  140   66  116  475  160  51   60  101  ...   85  47  20   

   14 15 16   17  18  19 20  
0   0  0  0   77  15  49  0  
1  10  0  0   54  12  73  0  
2  31  0  0  110   0  90  0  
3  22  0  0  132  10  32  0  
4   0  0  0  125  15  96  0  

[5 rows x 22 columns]

The columns 01-20 contain the GDELT root event codes, like "fight" for code 19 or "coerce" for code 17.

See also the column definition for EventRootCode at https://mind-node.net/gdelt-event-2-0-cheat-sheet/ and the CAMEO event table at https://mind-node.net/cameo-event-codes/. An example usage is shown at https://mind-node.net/country-dashboard/. The original description comes from the GDELT project: http://data.gdeltproject.org/documentation/CAMEO.Manual.1.1b3.pdf
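
For reference, the root codes used most in this notebook map to the following CAMEO labels. This is only a small sketch; the labels are taken from the cheat sheet linked above, so double-check them against the manual.

# hypothetical helper dict: CAMEO root event code -> label (verify with the CAMEO manual)
root_code_labels = {
    '14': 'protest',
    '17': 'coerce',
    '18': 'assault',
    '19': 'fight',
}
print(root_code_labels['19'])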

I save the SQL result in a CSV file and load that file for repeated runs, as this is faster.

Load data and convert date

In [3]:
import pandas as pd
import numpy as np

#csvfilename = 'events_ETH_Day_NumArt_sumRootCode.csv'
csvfilename = 'ar_data_ETH.csv'

# filepath relative to jupyter notebook
csvfilepath = '../data/'
csvfilepath = ''

df_1 = pd.read_csv(csvfilepath + csvfilename                   
                   ,delimiter=','
                   #,skipinitialspace = True
                   #,encoding='latin_1'
                   ,header=0
                   ,index_col=False                  
                   #converters = {"account2no":conv_i})
                   #,dtype={"Kategorie CIP":'str'}
                  )

# change format of datetime
df_1['Day'] = pd.to_datetime(df_1['Day'], format='%Y%m%d')
df_1 = df_1.drop(['Unnamed: 0'], axis=1)
df_1['day_of_week'] = df_1['Day'].dt.dayofweek
df_1['week'] = df_1['Day'].dt.week
print(df_1.dtypes)
print(df_1.head(9))
listofcolumns = df_1.columns
print(listofcolumns)
print("shape: ", df_1.shape)
#df_1.plot(subplots=True, figsize=(20,25), grid=True)
df_1['19'].plot(subplots=True, figsize=(20,15), grid=True, style= 'k.')
df_1['19'].describe()
Day            datetime64[ns]
total                   int64
01                      int64
02                      int64
03                      int64
04                      int64
05                      int64
06                      int64
07                      int64
08                      int64
09                      int64
10                      int64
11                      int64
12                      int64
13                      int64
14                      int64
15                      int64
16                      int64
17                      int64
18                      int64
19                      int64
20                      int64
day_of_week             int64
week                    int64
dtype: object
         Day  total   01   02   03   04   05  06   07   08  ...  13  14  15  \
0 2017-01-01    679   62   31   59  154  106   0   57   17  ...   1   0   0   
1 2017-01-02   1670  197   52  186  305  320  22  163   49  ...  34  10   0   
2 2017-01-03   1659  219  144   77  285  141  34  114   86  ...   4  31   0   
3 2017-01-04   2038  248  136   87  560  131  54  144  171  ...   0  22   0   
4 2017-01-05   1570  140   66  116  475  160  51   60  101  ...  20   0   0   
5 2017-01-06   2806  300  136  206  373  116  67  840   90  ...   0  20   0   
6 2017-01-07   1523  103   54   26  166  122  26  676   34  ...  10   0   0   
7 2017-01-08   1099   93   91  129  341   55  30  108   57  ...   0  14   0   
8 2017-01-09   1588  181   78  141  569  137  23  119   53  ...  20  40   4   

   16   17  18   19  20  day_of_week  week  
0   0   77  15   49   0            6    52  
1   0   54  12   73   0            0     1  
2   0  110   0   90   0            1     1  
3   0  132  10   32   0            2     1  
4   0  125  15   96   0            3     1  
5  13  121  30  289   0            4     1  
6  12    8  10  163   0            5     1  
7   0    9  11   92   0            6     1  
8  10   34  26   49   0            0     2  

[9 rows x 24 columns]
Index(['Day', 'total', '01', '02', '03', '04', '05', '06', '07', '08', '09',
       '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
       'day_of_week', 'week'],
      dtype='object')
shape:  (1073, 24)
Out[3]:
count    1073.000000
mean      117.153774
std       163.681308
min         0.000000
25%        47.000000
50%        78.000000
75%       131.000000
max      2166.000000
Name: 19, dtype: float64

Add basic statistical properties

In [4]:
def add_stats(df_stats, columnName):
    # add rolling mean, min, max and a lagged copy to the given column

    # create window for calculating the rolling statistics
    wwidth = 5 # window width
    window2 = df_stats[columnName].rolling(window=wwidth)
    # add statistical data
    df_stats[columnName+'-avg'] = window2.mean()
    df_stats[columnName+'-min'] = window2.min()
    df_stats[columnName+'-max'] = window2.max()

    # lagged copy of the column, shifted by wwidth-1 rows
    columnNameShift = columnName+'-t-'+str(wwidth)
    df_stats[columnNameShift] = df_stats[columnName].shift(wwidth-1)
    # testing expanding window (currently unused)
    #window = df_stats[columnName].expanding()
    return df_stats


def loop_columns(df_loop, exclude=[]):
    # loop over all columns and exclude the given list of columns
    # df_loop: df to loop over columns
    # exclude: list of column names to exclude from the loop
    # returns a df (which might be altered)

    for (columnName, columnData) in df_loop.iteritems():
        if not columnName in exclude:
            # do something with this column, e.g. call a function
            #print('Column Name : ', columnName)
            df_loop = add_stats(df_loop, columnName)

    return df_loop


df_2 = df_1.copy()
df_2 = loop_columns(df_2, ["Day", "day_of_week", "week"])

df_2['month'] = [df_2.Day[i].month for i in range(len(df_2))]
df_2['day'] = [df_2.Day[i].day for i in range(len(df_2))]
df_2['year'] = [df_2.Day[i].year for i in range(len(df_2))]
df_2['day_diff'] = [(df_2.Day[i] - df_2.Day[0]).days for i in range(len(df_2))]
df_2['const']=1

# look at specific columns, everything's right?
print(df_2.iloc[:,:].head().dropna())
#print(df_2.dtypes)


# delete old df to free memory:
#del df_1

# show python vars in buffer:
#%whos
df_2['19'].describe()
         Day  total   01  02   03   04   05  06  07   08  ...  19-t-5  20-avg  \
4 2017-01-05   1570  140  66  116  475  160  51  60  101  ...    49.0     0.0   

   20-min  20-max  20-t-5  month  day  year  day_diff  const  
4     0.0     0.0     0.0      1    5  2017         4      1  

[1 rows x 113 columns]
Out[4]:
count    1073.000000
mean      117.153774
std       163.681308
min         0.000000
25%        47.000000
50%        78.000000
75%       131.000000
max      2166.000000
Name: 19, dtype: float64

Granger Causality

I am interested in predicting column 19, "fight". To see which columns can help me predict it, I test Granger causality against the other columns.

In [5]:
# example from statsmodel
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
import numpy as np
data = sm.datasets.macrodata.load_pandas()

data = data.data[['realgdp', 'realcons']].pct_change().dropna()
print(data)
data.info()
gc_res = grangercausalitytests(data, 4, verbose=False)
for key in gc_res.keys():
    print(key)
    _F_test_ = gc_res[key][0]['params_ftest'][0]
    print(_F_test_)
#print(gc_res)
      realgdp  realcons
1    0.025256  0.015404
2   -0.001192  0.010440
3    0.003501  0.001085
4    0.022438  0.009580
5   -0.004674  0.012652
..        ...       ...
198 -0.006758 -0.008908
199 -0.013710 -0.007812
200 -0.016475  0.001512
201 -0.001850 -0.002193
202  0.006886  0.007291

[202 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 202 entries, 1 to 202
Data columns (total 2 columns):
realgdp     202 non-null float64
realcons    202 non-null float64
dtypes: float64(2)
memory usage: 4.7 KB
1
28.72480341484925
2
18.988046084107467
3
13.50147621775171
4
10.964552417824162
In [6]:
from statsmodels.tsa.stattools import grangercausalitytests

df_2 = df_2.dropna()


def F_Test(df_loop, columnName, columnTest='19'):
    lag = 109
    F_testt = []
    try:
        gc_res = grangercausalitytests(df_2[[columnTest, columnName]], lag, verbose=False)
        #print(gc_res)
        #print(len(gc_res))
        for key in gc_res.keys():            
            _F_test_ = gc_res[key][0]['params_ftest'][0]
            #_F_test_ = granger_test_result[key][0]['params_ftest'][0]
            #print(columnName + "  "+ str(key)+"  " +str(_F_test_))
            F_testt.append({ 'columnName': columnName, 'lag' : key, 'F_Test': _F_test_})
        return F_testt
    except:
        # grangercausalitytests can fail (e.g. on short or constant columns);
        # return an empty list so the flattening step below still works
        return []
        
    
    
def loop_columns(df_loop, exclude, function):
    # loop over all columns and exclude the given list of columns
    # df_loop:   df to loop over columns
    # exclude:   list of column names to exclude from loops
    # function:  function to call for each column
    # returns:   depends on function called
    
    F_tests = []
    for (columnName, columnData) in df_loop.iteritems():
        if not columnName in exclude:
            # Do s.th. with this column, e.g. call a function
            #print('Colunm Name : ', columnName)
            if function == 'F_Test':
                F_tests.append(F_Test(df_loop, columnName))                
    
    # remove none values
    #F_tests = [i for i in F_tests if i]     
    return F_tests

#df_2.dropna(0)
best_F = loop_columns(df_2[["total","01","02","03","04","05","06","07","08","09","10","11","12","13","14","15", "16"]], ["Day", "19"], 'F_Test')


# flatten list of lists
best_F = [item for sublist in best_F for item in sublist]

# sort by F-test
best_F = sorted(best_F, key=lambda k: k['F_Test'])

print("col-lag- F-test granger  (order by F-test)")
column_lag_ftest = []
for item in best_F:
    column_lag_ftest.append(str(item['columnName'])+" - "+str(item['lag'])+" - "+str(item['F_Test']))
    # print(str(item['columnName'])+" - "+str(item['lag'])+" - "+str(item['F_Test']))


# print the columns lags with the highest f-test value
for item in column_lag_ftest[-20:]:
    print(item)

#print(best_F)
col-lag- F-test granger  (order by F-test)
06 - 4 - 3.5570359440517585
06 - 6 - 3.6537347625538437
total - 3 - 3.7563307786502604
01 - 9 - 3.801916977359596
02 - 6 - 3.8963364482768896
01 - 7 - 3.975023911319591
04 - 4 - 4.069486818106526
02 - 5 - 4.2352711798437594
06 - 3 - 4.575068167640924
01 - 6 - 4.587045053572192
01 - 8 - 4.663992585914475
13 - 1 - 4.732402306486073
01 - 5 - 5.447422385782612
02 - 4 - 5.456481012188707
04 - 3 - 5.471581800297122
01 - 4 - 6.640496757734919
02 - 3 - 6.891906104871886
01 - 3 - 8.822732413838105
01 - 2 - 10.335321082273202
01 - 1 - 13.533916160077428

Plot the data to understand it

Plot basic stats

In [7]:
from matplotlib import pyplot
import seaborn as sns

# make extra window for plot
#%matplotlib qt
%matplotlib inline


plot_column_y = "19"
plot_column_x = "Day"
ax=df_2.plot(x=plot_column_x, y=plot_column_y, color="red", figsize=(20,15), grid=False)
df_2.plot(x=plot_column_x, y=[plot_column_y+'-min', plot_column_y+'-max'], ax=ax, color="grey")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-t-5' ], ax=ax, color="green")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-avg' ], ax=ax, color="blue")
#df_2.plot(title=plot_column_y+" avg, min, max")
#df_2.plot.close()
pyplot.show()

Compare datasets

In [8]:
# compare two datasets from the imported data:
%matplotlib inline

fig, ax1 = pyplot.subplots(figsize=(20,10))
# use twinx() instead of ax1 here for two separate y scales
ax2 = ax1  #.twinx()

ax1.plot(df_2[plot_column_x], df_2[plot_column_y], 'g-')
ax2.plot(df_2[plot_column_x], df_2['01'], 'b-')
ax1.set_xlabel(plot_column_x)
ax1.set_ylabel(plot_column_y, color='g')
ax2.set_ylabel('01', color='b')
ax1.set(title=plot_column_y+" and 01" )

pyplot.show()

FFT

Here I try to find frequencies in the signal (not quite working yet).

In [9]:
import numpy as np
import matplotlib.pyplot as plt

#data = np.loadtxt("data.txt")
data = np.array([[0, 1.01],
                 [0.5, 1.04],
                 [1, 1.05],
                 [1.5, 1.24]])

t        = data[:,0]
signal_t = data[:,1]

dt = t[1]-t[0]

f        = np.fft.fftfreq(t.size, dt)
signal_f = np.fft.fft(signal_t)*dt

plt.figure()
plt.stem(np.fft.fftshift(f), np.fft.fftshift(np.abs(signal_f)))
plt.show()

print(df_2.index)
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/ipykernel_launcher.py:19: UserWarning: In Matplotlib 3.3 individual lines on a stem plot will be added as a LineCollection instead of individual lines. This significantly improves the performance of a stem plot. To remove this warning and switch to the new behaviour, set the "use_line_collection" keyword argument to True.
Int64Index([   4,    5,    6,    7,    8,    9,   10,   11,   12,   13,
            ...
            1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072],
           dtype='int64', length=1069)
In [10]:
t = np.array(list(range(1, 1070)))
#print(t)
signal_t = df_2['19'].values

dt = t[1]-t[0]

f        = np.fft.fftfreq(t.size, dt)
signal_f = np.fft.fft(signal_t)*dt

plt.figure(figsize=(25,4))
x = np.fft.fftshift(f)
y = np.fft.fftshift(np.abs(signal_f))
plt.xlim(-.001, .1)
plt.stem(x, y)
#axes.set_xlim([0,6])
plt.show()
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/ipykernel_launcher.py:14: UserWarning: In Matplotlib 3.3 individual lines on a stem plot will be added as a LineCollection instead of individual lines. This significantly improves the performance of a stem plot. To remove this warning and switch to the new behaviour, set the "use_line_collection" keyword argument to True.
  
In [11]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io.wavfile as wave

# rate,data = wave.read('57.wav')
data = df_2['19']
spectre = np.fft.fft(data)
freq = np.fft.fftfreq(data.size, 100)
mask=freq>0   
plt.plot(freq[mask],np.abs(spectre[mask]))#,  figsize=(25,4))
Out[11]:
[<matplotlib.lines.Line2D at 0x7eff6af38150>]
In [12]:
t = np.linspace(0, 2*np.pi, 1000, endpoint=True)
f = 3.0 # Frequency in Hz
A = 100.0 # Amplitude in Unit
s = A * np.sin(2*np.pi*f*t) # Signal



plt.plot(t,s)
plt.xlabel('Time ($s$)')
plt.ylabel('Amplitude ($Unit$)')
plt.show()

Y = np.fft.fft(s)

plt.plot(Y)
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/numpy/core/_asarray.py:85: ComplexWarning: Casting complex values to real discards the imaginary part
  return array(a, dtype, copy=False, order=order)
Out[12]:
[<matplotlib.lines.Line2D at 0x7eff6aeb5090>]
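
One likely reason the spectra above are hard to read is that the mean of the daily series (the zero-frequency component) dominates everything else. A minimal sketch that removes the mean first, assuming a sampling interval of one day:

# remove the mean so the zero-frequency component does not dominate the spectrum
signal_t = df_2['19'].values.astype(float)
signal_t = signal_t - signal_t.mean()

f = np.fft.fftfreq(signal_t.size, d=1.0)  # daily samples -> frequency in 1/day
spectrum = np.abs(np.fft.fft(signal_t))

plt.figure(figsize=(25, 4))
plt.plot(f[f > 0], spectrum[f > 0])
plt.xlabel('frequency (1/day)')
plt.show()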

Difference and Log Difference

In [13]:
# create a difference transform of the dataset
def difference(dataset):
	diff = list()
	for i in range(1, len(dataset)):
		value = dataset[i] - dataset[i - 1]
		diff.append(value)
	return np.array(diff)


# calc diff and ln diff
df_2[plot_column_y+'-ln'] = np.log(df_2[plot_column_y])
df_2[plot_column_y+'-ln'+'-diff'] = df_2[plot_column_y+'-ln'].dropna().diff()
df_2[plot_column_y+'-diff'] = df_2[plot_column_y].diff()
df_2.head()

# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))

# Difference
axes[0].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-diff'], '-')
axes[0].set(title='difference of '+plot_column_y)

# Log difference
axes[1].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-ln'+"-diff"], '-')
axes[1].hlines(0, df_2.index[0], df_2.index[-1], 'r')
axes[1].set(title='19 - difference of logs');
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/pandas/core/series.py:853: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)

Histogram

In [14]:
# Graph data
#fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
#axes[0].plot(df_2['19-avg'], kind= 'kde')

df_2['19'].plot(kind= 'kde')#, figsize=(20,9)
pyplot.show()
df_2['19'].hist()
pyplot.show()
print('The histogram shows that the values do not have a Gaussian distribution.\n\n')
sns.boxplot( y=df_2['19']);
pyplot.show()
print('This plot draws a box around the 25th and 75th percentiles of the data that captures the middle 50% of observations.\n\
A line is drawn at the 50th percentile (the median) and whiskers are drawn above and below the box \n\
to summarize the general extents of the observations.\n\
Dots are drawn for outliers outside the whiskers or extents of the data.\n\n')
df_1['19'].describe()
The histogram shows that the values do not have a Gaussian distribution.


This plot draws a box around the 25th and 75th percentiles of the data that captures the middle 50% of observations.
A line is drawn at the 50th percentile (the median) and whiskers are drawn above and below the box 
to summarize the general extents of the observations.
Dots are drawn for outliers outside the whiskers or extents of the data.


Out[14]:
count    1073.000000
mean      117.153774
std       163.681308
min         0.000000
25%        47.000000
50%        78.000000
75%       131.000000
max      2166.000000
Name: 19, dtype: float64

Square Root Transform

In [15]:
df_2['19-sqrt'] = np.sqrt(df_2['19'].values)
df_2['19-sqrt'].plot()
#df_2['19'].plot()
pyplot.show()
df_2['19-sqrt'].hist()
pyplot.show()

Log Transform

In [16]:
df_2['19-log'] = np.log(df_2['19'].values)
df_2['19-log'].plot()
pyplot.show()

print(df_2.sort_values('19-log'))
df_2['19-log'].replace(np.inf, 0, inplace=True)
#pyplot.show()
df_2['19-log'].describe()
/home/ralf/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: divide by zero encountered in log
  """Entry point for launching an IPython kernel.
            Day  total    01    02    03    04   05    06   07   08  ...  \
1030 2019-10-31    125    12     5     8    44    0     0    0    5  ...   
1051 2019-11-23     32     0     0     0     0    0     0   20    0  ...   
964  2019-08-24    784    99    27   137   224  161    20   15    0  ...   
788  2019-02-28   1773   187   101   252   565  321    88   97   42  ...   
252  2017-09-10   1271   105    82   100   524   69    47   57   72  ...   
...         ...    ...   ...   ...   ...   ...  ...   ...  ...  ...  ...   
800  2019-03-12  22477  3687  1225  1593  7791  879   871  533  320  ...   
798  2019-03-10  14235  3816   454   842  4183  347   676  289  405  ...   
799  2019-03-11  24949  5079   831  1818  8024  914  1702  934  501  ...   
904  2019-06-24   9583  1196   928   372  1584  603   168  143  225  ...   
903  2019-06-23   8198  1655   569   143  1246  286    59   51  349  ...   

      month  day  year  day_diff  const     19-ln  19-ln-diff  19-diff  \
1030     10   31  2019      1033      1      -inf        -inf   -167.0   
1051     11   23  2019      1056      1      -inf        -inf    -65.0   
964       8   24  2019       965      1      -inf        -inf    -71.0   
788       2   28  2019       788      1      -inf        -inf    -18.0   
252       9   10  2017       252      1      -inf        -inf     -2.0   
...     ...  ...   ...       ...    ...       ...         ...      ...   
800       3   12  2019       800      1  7.145196   -0.401778   -627.0   
798       3   10  2019       798      1  7.424762    3.364319   1619.0   
799       3   11  2019       799      1  7.546974    0.122212    218.0   
904       6   24  2019       904      1  7.571474   -0.109164   -224.0   
903       6   23  2019       903      1  7.680637    3.446531   2097.0   

        19-sqrt    19-log  
1030   0.000000      -inf  
1051   0.000000      -inf  
964    0.000000      -inf  
788    0.000000      -inf  
252    0.000000      -inf  
...         ...       ...  
800   35.608988  7.145196  
798   40.951190  7.424762  
799   43.531598  7.546974  
904   44.068129  7.571474  
903   46.540305  7.680637  

[1069 rows x 118 columns]
Out[16]:
count    1069.000000
mean            -inf
std              NaN
min             -inf
25%         3.850148
50%         4.356709
75%         4.875197
max         7.680637
Name: 19-log, dtype: float64

Box-Cox Transform

In [17]:
from scipy.stats import boxcox
df_2['19-avg-boxcox'], lam = boxcox(df_2['19-avg'].values)
print( ' Lambda: %f ' % lam)
df_2['19-avg-boxcox'].plot()
pyplot.show()
df_2['19-avg-boxcox'].hist()
pyplot.show()
 Lambda: -0.408395 

Autocorrelation and Partial Autocorrelation

In [18]:
import statsmodels.api as sm
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
df_2.set_index('Day')
print(df_2.iloc[1:][plot_column_y+'-ln'+'-diff'])
#fig = sm.graphics.tsa.plot_acf(df_2[plot_column_y], lags=20, ax=axes[0])
#fig = sm.graphics.tsa.plot_pacf(df_2[plot_column_y+"-diff"], lags=150, ax=axes[1])

#print(df_2['19-avg-log'].dropna())

fig = sm.graphics.tsa.plot_acf(df_2['19'].dropna(0), lags=150, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(df_2['19'].dropna(0), lags=150, ax=axes[1])
5       1.102078
6      -0.572676
7      -0.571962
8      -0.629968
9       0.693147
          ...   
1068    0.609064
1069   -0.747214
1070   -0.523248
1071    1.571217
1072    0.731368
Name: 19-ln-diff, Length: 1068, dtype: float64

Autocorrelation

A correlation value calculated between two groups of numbers, such as the observations and their lag-1 values, lies between -1 and 1. The sign of this value indicates a negative or positive correlation. A value close to zero suggests a weak correlation, whereas a value closer to -1 or 1 indicates a strong correlation.
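
As a quick check of the lag-1 case described above, pandas can compute this correlation directly; a small sketch using the '19' column from this notebook:

# lag-1 autocorrelation of the raw '19' series (a value between -1 and 1)
lag1 = df_2['19'].autocorr(lag=1)
print('lag-1 autocorrelation of column 19: %.3f' % lag1)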

In [19]:
from pandas.plotting import autocorrelation_plot


#print(len(df_2['19'].dropna())) # 1073
#print(len(df_2['19'].dropna())) # 1069
#df_3 = df_2[['Day', '19', '19-avg']].dropna()
#print(df_3.iloc[110:120,0:8])
#fig, axes = pyplot.subplots(1, 2, figsize=(25,4))

#axes[0].plot(autocorrelation_plot(df_2['18']))
#axes[0]

ax = autocorrelation_plot(df_2['19-avg'].dropna())
ax.set_xlim([0,120])
ax.set_ylim([-0.25,0.75])
df_2['19-avg'].dropna().autocorr()
#autocorrelation_plot(df_2['19-avg'].dropna())
pyplot.show()

Dotted lines indicate the bounds above which correlation values are statistically significant (meaningful).
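
For reference, pandas' autocorrelation_plot draws these bands at roughly z / sqrt(N); a small sketch of the bounds for the series plotted above (the exact z values used by pandas may differ slightly):

n = df_2['19-avg'].dropna().shape[0]
print('95%% band: +/- %.3f' % (1.96 / np.sqrt(n)))
print('99%% band: +/- %.3f' % (2.58 / np.sqrt(n)))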

Time Series Decomposition

In [20]:
from statsmodels.tsa.seasonal import seasonal_decompose

result = seasonal_decompose(df_2['19-avg'], model= 'multiplicative', freq=365)
result.plot()
pyplot.show()

Make SARIMAX model and fit parameters

In [21]:
# data to fit
series_2 = (np.asarray(df_2['01']).astype( 'float32' ) / np.asarray(df_2['01']).astype( 'float32' ).max())
series = (df_2['19'].dropna().astype('float64') / df_2['19'].dropna().astype('float64').max() )
print(series_2.max())
print(series.max())
series = [z for z in series]
#series_2 =  df_2[['day_of_week', 'const']]
1.0
1.0

Iterate over selected pdq and PDQM parameters.

Running the cell below might take a long time (days).
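
Before the long grid search, a single fit with the starting parameters can serve as a sanity check. This is only a minimal sketch, assuming series and series_2 from the cell above are still in memory; the order and seasonal_order are the starting values used further down.

from statsmodels.tsa.statespace.sarimax import SARIMAX

# single SARIMAX fit with the starting order used below
quick_model = SARIMAX(series, exog=series_2, order=(1, 0, 0), seasonal_order=(1, 1, 2, 2))
quick_fit = quick_model.fit(disp=False)
print(quick_fit.summary())
# one-step forecast, feeding the last exogenous value again
print(quick_fit.forecast(steps=1, exog=[[series_2[-1]]]))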

In [ ]:
import warnings
from math import sqrt
from statsmodels.tsa.statespace.sarimax import SARIMAX
#from statsmodels.tsa.sarimax_model import SARIMAX
from sklearn.metrics import mean_squared_error

import datetime
import sys  # used in the exception handler of evaluate_sarimax_model

# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

# evaluate an SARIMAX model for a given order (p,d,q) (PDQ M)
def evaluate_sarimax_model(X, X_2, arima_order, arima_seasonal_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    train_2, test_2 = X_2[0:train_size], X_2[train_size:]
    history = [x for x in train]
    history_2 = [z for z in train_2]
    endog=history
    exog=history_2
    #print(train)
    #print(".............")
    #print("train_2 len: " + str(len(train_2)))
    #print(train_2)
    #print("train len: " + str(len(train)))
    #print(train)
    #print("length test: %.i" %len(test))
    #print(test[709])
    #print("-------")
    
    # make predictions
    predictions = list()
    printProgressBar(0, len(test), prefix = 'Progress:', suffix = 'Complete', length = 80)
    for t in range(len(test)):
        #print("... testing %.i"%t)
        #print(arima_seasonal_order)
        printProgressBar(t, len(test), prefix = 'Progress:', suffix = 'Complete', length = 80)
        try:
            model = SARIMAX(endog, exog=exog, order=arima_order, seasonal_order=arima_seasonal_order)
            #model = SARIMAX(endog, order=arima_order, seasonal_order=arima_seasonal_order)            
            model_fit = model.fit(disp=0) #method='nm'
            #print("model_fit done")
            #yhat = model_fit.forecast(steps=1, exog=[[2,1]], dynamic= True)
            yhat = model_fit.forecast(steps=1, exog=[[history_2[-1]]], dynamic= True)[0]#steps=1, exog=[[2,1]], dynamic= True)
            #print("yhat: " + yhat[1])
            predictions.append(yhat)
            #print(test[t+709])
            history.append(test[t])
            history_2.append(test_2[t])
        except:
            print("... testing %.i"%t)
            print("... exception %.i"%t)
            e = sys.exc_info()[0]
            print(e)
            #continue
        
    # calculate out of sample error
    rmse = np.sqrt(mean_squared_error(test, predictions))
    print(rmse)
    return rmse

# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, dataset_2, my_order, my_seasonal_order, P_values, D_values, Q_values, M_values, p_values, d_values, q_values, searchmode):
    #dataset = dataset.astype( 'float32' )
    #dataset_2 = dataset_2.astype( 'float32' )
    best_score, best_cfg = float("inf"), None
    # Test data, variables in SARIMAX model without loop
    if searchmode==0:
        model = SARIMAX(series, exog=series_2, order=my_order, seasonal_order=my_seasonal_order)
        model_fit = model.fit(disp=True)
        print(model_fit.summary())
        fo =  model_fit.forecast(steps=1, exog=[[2]], dynamic= True)[0]
        print(fo)
        return model_fit
    i=1
    if searchmode==2 or searchmode==3: # 2 = take pdq and my_seasonal_order
        for p in p_values:
            for d in d_values:
                for q in q_values:
                    my_order = (p,d,q)
                    if searchmode==2:
                        try:
                            rmse = evaluate_sarimax_model(dataset, dataset_2, my_order, my_seasonal_order)
                            if rmse < best_score:
                                best_score, best_cfg = rmse, str(my_order)+str(my_seasonal_order)
                            print( '\nARIMA%s%s RMSE=%.3f  Round %s' % (my_order, my_seasonal_order,rmse, i))
                            currDate = datetime.datetime.now()
                            print(currDate)
                            i+=1
                        except:
                            print( '\nexcept ARIMA%s%s Round %s' % (my_order, my_seasonal_order, i))
                            continue
                    if searchmode==3:
                        print('if searchmode==3 is not implemented')
                        break
                        
    if searchmode==1:        
        for P in P_values:
            for D in D_values:
                for Q in Q_values:
                    for M in M_values:
                        my_seasonal_order = (P,D,Q, M)
                        try:
                            rmse = evaluate_sarimax_model(dataset, dataset_2, my_order, my_seasonal_order)
                            #rmse = evaluate_arima_model(dataset, order)
                            if rmse < best_score:
                                best_score, best_cfg = rmse, str(my_order)+str(my_seasonal_order)
                            print( '\nARIMA%s%s RMSE=%.3f  Round %s \n' % (my_order, my_seasonal_order,rmse, i))
                            currDate = datetime.datetime.now()
                            print(currDate)
                            i+=1
                        except:
                            print( '\nexcept ARIMA%s%s Round %s' % (my_order, my_seasonal_order, i))
                            continue
                    
    print( ' Best SARIMAX%s RMSE=%.3f ' % (best_cfg, best_score))

#my_order = (9, 1, 4) # good for 19-avg
my_order = (1, 0, 0) # good for 19
my_seasonal_order=(1, 1, 2, 2)
# evaluate parameters
p_values = [0, 1]
d_values = range(0, 1)
q_values = range(0, 1)

# evaluate parameters
P_values = [0, 1, 2]#[0, 1]#range(0, 3)
D_values = [0, 1, 2]#, 1]#range(0, 3)
Q_values = [0, 1, 3]#, 2]#range(0, 3)
M_values = range(0, 3)


#yhat = model_fit.predict(start=len(series), end=len(series))
#print("yhat 1: %.3f"%yhat)

# 0 = take my_order and my_seasonal_order # calc just one model
# 1 = take PDQM and my_order              # iterate just seasonality
# 2 = take pdq and my_seasonal_order      # iterate just arima 
# 3 = take PDQM and pdq                   # iterate all
searchmode = 1

warnings.filterwarnings("ignore")
evaluate_models(series, series_2, my_order, my_seasonal_order, P_values, D_values, Q_values, M_values, p_values, d_values, q_values, searchmode)
# my_order = (1,1,2)  my_seasonal_order=(1,1,1,12) rmse: 48.41061459833725  19-avg
#                                          exception at 118
#my_order = (1,1,2)  my_seasonal_order=(1,1,1,12) rmse: 48.41061459833725 rmse: 176.606  19
# Best SARIMAX(1,1,2)(1, 0, 1, 0) RMSE=174.665 
# ARIMA(1, 0, 1, 103) RMSE=175.186  Round 1
# ARIMA(1, 0, 1, 104) RMSE=177.115  Round 2
0.0803436494432014████████████████████████████████████████████████████████████████████████-| 99.7% Complete

ARIMA(1, 0, 0)(0, 0, 0, 0) RMSE=0.080  Round 1 

2020-05-18 05:11:53.766451
... testing 216███████████████████████████████████████████---------------------------------| 59.3% Complete
... exception 216

except ARIMA(1, 0, 0)(0, 0, 0, 1) Round 2
... testing 30███--------------------------------------------------------------------------| 8.2% Complete
... exception 30

except ARIMA(1, 0, 0)(0, 0, 0, 2) Round 2
Progress: |███████-------------------------------------------------------------------------| 9.3% Complete
In [22]:
print(0.07985695419329582*df_2['19'].dropna().astype('float64').max() )
172.97016278267873

I ran the model with different input parameters. Here are the more important results:

ARIMA(0, 0, 0, 1) RMSE=35.855 Round 2 2020-01-26 02:22:05.357431 (target: 19-avg)

Best SARIMAX(2, 0, 2, 0) RMSE=35.943 with series_2 = np.asarray(df_2['12']).astype('float32') and series = df_2['19-avg'].dropna().astype('float64')

ARIMA(2, 0, 2, 0) RMSE=175.109 Round 30 with series_2 = np.asarray(df_2['03']).astype('float32') and series = df_2['19'].dropna().astype('float64')

Best SARIMAX(1, 1, 2, 2) RMSE=172.336 with series_2 = np.asarray(df_2['01']).astype('float32') and series = df_2['19'].dropna().astype('float64')

Best SARIMAX(1, 0, 0)(0, 1, 1, 2) RMSE=0.080 with the normalized series series_2 = np.asarray(df_2['01']).astype('float32') / np.asarray(df_2['01']).astype('float32').max() and series = df_2['19'].dropna().astype('float64') / df_2['19'].dropna().astype('float64').max(); scaled back to the original units: 0.07985695419329582 * np.asarray(df_2['01']).astype('float32').max() = 405.5934703477495 and 0.07985695419329582 * df_2['19'].dropna().astype('float64').max() = 172.97016278267873

Output of a successful run

series_2: 1073 series_2: 1073

model_fit.summary:

                                 Statespace Model Results                                
=========================================================================================
Dep. Variable:                                19   No. Observations:                 1073
Model:             SARIMAX(1, 1, 2)x(1, 1, 1, 4)   Log Likelihood               -6657.311
Date:                           Sat, 25 Jan 2020   AIC                          13330.622
Time:                                   19:51:22   BIC                          13370.410
Sample:                                        0   HQIC                         13345.696
                                      - 1073                                         
Covariance Type:                             opg                                         
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
day_of_week    -6.9472      2.257     -3.078      0.002     -11.370      -2.524
const           0.0021   8820.484   2.38e-07      1.000   -1.73e+04    1.73e+04
ar.L1           0.6690      0.040     16.778      0.000       0.591       0.747
ma.L1          -0.9981      0.485     -2.057      0.040      -1.949      -0.047
ma.L2          -0.0019      0.039     -0.048      0.962      -0.078       0.075
ar.S.L4        -0.0163      0.052     -0.312      0.755      -0.119       0.086
ma.S.L4        -1.0000      4.274     -0.234      0.815      -9.376       7.376
sigma2        1.47e+04   5.94e+04      0.247      0.805   -1.02e+05    1.31e+05
===================================================================================
Ljung-Box (Q):                       35.69   Jarque-Bera (JB):            579232.48
Prob(Q):                              0.66   Prob(JB):                         0.00
Heteroskedasticity (H):               4.31   Skew:                             7.86
Prob(H) (two-sided):                  0.00   Kurtosis:                       116.00
===================================================================================

Warnings:    [1] Covariance matrix calculated using the outer product of gradients (complex-step).

Forecast:
1073    148.824086
1074    145.935440
dtype: float64

Save model

In [15]:
#from statsmodels.tsa.arima_model import AR
from statsmodels.tsa.ar_model import AR
import numpy as np

# create a difference transform of the dataset
def difference(dataset):
	diff = list()
	for i in range(1, len(dataset)):
		value = dataset[i] - dataset[i - 1]
		diff.append(value)
	return np.array(diff)

columnpredict = "19-avg"
series = df_2[columnpredict].astype('float64').dropna().values
model = AR(series)
model_fit = model.fit(maxlag=6, disp=False)

# should be the same country as in the DB query cell at the top
country = "ETH"
# save model to file
model_fit.save('ar_model_'+country+'_'+columnpredict+'.pkl')
# save the differenced dataset
X = difference(series)
np.save('ar_data_'+country+'_'+columnpredict+'.npy', X)
# save the last ob
np.save('ar_obs_'+country+'_'+columnpredict+'.npy', [series[-1]])

Predict Value

In [24]:
from statsmodels.tsa.ar_model import ARResults
#from statsmodels.tsa.arima_model import ARIMAResults
country = 'ETH'
# load model
columnpredict = "19-avg"

model = ARResults.load('ar_model_'+country+'_'+columnpredict+'.pkl')
#model = ARResults.load('ar_model_'+country+'_'+columnpredict+'.pkl')
data = np.load('ar_data_'+country+'_'+columnpredict+'.npy')
last_ob = np.load('ar_obs_'+country+'_'+columnpredict+'.npy')

print(len(data))
print(data)
print(last_ob)

# make prediction
predictions = model.predict(start=len(data), end=len(data))
print('Predicted difference: %f' % predictions)
# transform prediction
yhat = predictions[0] + last_ob[0]
print('Prediction: %f' % yhat)
1068
[48.  18.   0.4 ... -8.4 -0.8 25.8]
[67.4]
Predicted difference: 46.623135
Prediction: 114.023135

Plot Error and Prediction

In [26]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# add a column with the future value
df_2[columnpredict+"+t1"] = df_2[columnpredict].shift(-1)
X = df_2[[columnpredict, columnpredict+"+t1"]].dropna().values

# split into train and test sets
train_size = int(len(X) * 0.66)
train, test = X[1:train_size], X[train_size:]

train_X, train_y = train[:,0], train[:,1]
test_X, test_y = test[:,0], test[:,1]


# persistence model
def model_persistence(x):
    return x

# trained model: one-step prediction with the loaded AR model
predictions_model = list()
def model_predict(x):
    predictions = model.predict(x, x)
    return predictions

# walk-forward validation
predictions_persistence = list()
predictions_model = list()
i=0
for x in test_X:
    #print(x)
    yhat_persistence = model_persistence(x)
    predictions_persistence.append(yhat_persistence)
    
    yhat_model = model_predict(i+train_size)
    predictions_model.append(yhat_model)
    i+=1
    

rmse_persistence = sqrt(mean_squared_error(test_y, predictions_persistence))
rmse_model = sqrt(mean_squared_error(test_y, predictions_model))
print( ' Test RMSE persistence: %.3f ' % rmse_persistence)
print( ' Test RMSE model: %.3f ' % rmse_model)
%matplotlib inline

pyplot.plot(train_y)
pyplot.plot([None for i in train_y] + [x for x in test_y], 'g-', label='test_y')
pyplot.plot([None for i in train_y] + [x for x in predictions_persistence], 'r-', label='prediction_pers')
pyplot.plot([None for i in train_y] + [x for x in predictions_model], 'y-', label='prediction_model')
pyplot.show()
 Test RMSE persistence: 62.879 
 Test RMSE model: 93.408 
In [27]:
import statsmodels.api as sm

fig, ax = pyplot.subplots()
ax = df_2[columnpredict].plot(ax=ax)
fig = model.plot_predict(len(data),len(data)+20, dynamic=True, ax=ax, plot_insample=False)
pyplot.show()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-27-dca8729f4494> in <module>
      3 fig, ax = pyplot.subplots()
      4 ax = df_2[columnpredict].plot(ax=ax)
----> 5 fig = model.plot_predict(len(data),len(data)+20, dynamic=True, ax=ax, plot_insample=False)
      6 pyplot.show()

~/Dokumente/computer/coding/python/lectures/lecture-env1/lib/python3.7/site-packages/statsmodels/base/wrapper.py in __getattribute__(self, attr)
     33             pass
     34 
---> 35         obj = getattr(results, attr)
     36         data = results.model.data
     37         how = self._wrap_attrs.get(attr)

AttributeError: 'ARResults' object has no attribute 'plot_predict'
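
A possible workaround, since ARResults in this statsmodels version does not expose plot_predict: forecast with model.predict (as in the prediction cell above) and plot the result next to the series. This is only a sketch and assumes model, data, df_2 and columnpredict are still in memory.

# forecast the next 20 steps with the loaded AR model
forecast = model.predict(start=len(data), end=len(data)+20, dynamic=True)

fig, ax = pyplot.subplots(figsize=(20, 5))
# plot the observed series on a 0-based position axis so it lines up with the forecast
df_2[columnpredict].reset_index(drop=True).plot(ax=ax, label=columnpredict)
ax.plot(range(len(data), len(data)+len(forecast)), forecast, 'r-', label='forecast')
ax.legend()
pyplot.show()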
In [ ]: