The CSV data is based on the following MySQL query against the gdelt_events_country_ETH table.
The gdelt_events_country_ETH table is the subset of the GDELT events data containing the records associated with Ethiopia.
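The queries below rely on only three columns of that table: Day (the event date as a YYYYMMDD integer), EventRootCode (the two-digit CAMEO root code of the event), and NumArticles (the number of source articles); the rest of the GDELT schema is not used here.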
import numpy as np # load prev. data for prediction
import os # del files, sys exit
import pandas as pd
# get fresh data from database:
import mysql.connector
from mysql.connector import Error
def execute_query(query, cnx):
    # run a query on an open connection and return all rows
    cursor = cnx.cursor(raw=True, buffered=True)
    result = None  # stays None if the query fails
    try:
        cursor.execute(query)
        result = cursor.fetchall()
        cursor.close()
    except Error as e:
        print("sql query error")
        errorstring = format(str(e.args[0]))
        errornumber = errorstring[:4]
        print(errornumber)
        print(errorstring)
    return result
# two alternative connection configurations; flip the constants to switch
if 1 == 2:  # remote database (disabled)
    mysql_config = {
        'user': 'DB-user',
        'password': 'DB-password',
        'host': 'xxx.xxx.xxx.xxx',
        'database': 'DB-name',
        'raise_on_warnings': True,
        'port': 'DB-port',
    }
if 1 == 1:  # local database (active)
    mysql_config = {
        'user': 'DB-user',
        'password': 'DB-password',
        'host': '127.0.0.1',
        'database': 'DB-name',
        'raise_on_warnings': True,
        'port': '3306',
        'ssl_disabled': True
    }
try:
    cnx = mysql.connector.connect(**mysql_config)
    if cnx.is_connected():
        print('connected to datacoll 2')
except Error as e:
    print("mysql DB connection error")
    print(e)
    print(format(str(e.args[0])))
    os.sys.exit(1)
# the model will be for this country:
# CHE AUT DEU ER ETH SDN SYR USA RW
country = "ETH"
query_general = """select Day
    , sum(NumArticles) as 'total'
    , sum(if(EventRootCode = '01', NumArticles, 0)) as '01'
    , sum(if(EventRootCode = '02', NumArticles, 0)) as '02'
    , sum(if(EventRootCode = '03', NumArticles, 0)) as '03'
    , sum(if(EventRootCode = '04', NumArticles, 0)) as '04'
    , sum(if(EventRootCode = '05', NumArticles, 0)) as '05'
    , sum(if(EventRootCode = '06', NumArticles, 0)) as '06'
    , sum(if(EventRootCode = '07', NumArticles, 0)) as '07'
    , sum(if(EventRootCode = '08', NumArticles, 0)) as '08'
    , sum(if(EventRootCode = '09', NumArticles, 0)) as '09'
    , sum(if(EventRootCode = '10', NumArticles, 0)) as '10'
    , sum(if(EventRootCode = '11', NumArticles, 0)) as '11'
    , sum(if(EventRootCode = '12', NumArticles, 0)) as '12'
    , sum(if(EventRootCode = '13', NumArticles, 0)) as '13'
    , sum(if(EventRootCode = '14', NumArticles, 0)) as '14'
    , sum(if(EventRootCode = '15', NumArticles, 0)) as '15'
    , sum(if(EventRootCode = '16', NumArticles, 0)) as '16'
    , sum(if(EventRootCode = '17', NumArticles, 0)) as '17'
    , sum(if(EventRootCode = '18', NumArticles, 0)) as '18'
    , sum(if(EventRootCode = '19', NumArticles, 0)) as '19'
    , sum(if(EventRootCode = '20', NumArticles, 0)) as '20'
    from gdelt_events_country_""" + country + """
    where left(Day, 4) > 2016
    group by Day;"""
query_19 = """select Day
    , sum(NumArticles) as 'total'
    , sum(if(EventRootCode = '17', NumArticles, 0)) as '17'
    , sum(if(EventRootCode = '18', NumArticles, 0)) as '18'
    , sum(if(EventRootCode = '19', NumArticles, 0)) as '19'
    from gdelt_events_country_""" + country + """
    where left(Day, 4) > 2016
    group by Day;"""
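# for reference: the sum(if(...)) pattern above is conditional aggregation,
# i.e. a pivot of EventRootCode into one column per code; a rough pandas
# equivalent, assuming a hypothetical per-event dataframe df_raw with the
# same three columns, would be:
#   df_raw.pivot_table(index='Day', columns='EventRootCode',
#                      values='NumArticles', aggfunc='sum', fill_value=0)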
columns_all = ["Day", "total", "17", "18", "19"]
result = execute_query(query_19, cnx)
# build the dataframe column by column; the raw cursor returns bytearrays,
# so every value is decoded to str first
df_1 = pd.DataFrame(columns=columns_all)
column_list = []
for col in range(len(columns_all)):
    for item in result:
        column_list.append(bytes(item[col]).decode())
    df_1[columns_all[col]] = column_list
    column_list = []
print(df_1.head())
df_1.to_csv(r'ar_data_'+country+'.csv')
### Load data and convert date
Import the CSV data as queried above. This is faster than the database query when running the notebook multiple times.
import pandas as pd
import numpy as np
csvfilename = 'ar_data_ETH.csv'
# file path relative to the jupyter notebook
csvfilepath = '../data/'
csvfilepath = ''  # override: the CSV sits next to the notebook
df_1 = pd.read_csv(csvfilepath + csvfilename,
                   delimiter=',',
                   header=0,
                   index_col=False)
# parse the YYYYMMDD integers into proper datetimes
df_1['Day'] = pd.to_datetime(df_1['Day'], format='%Y%m%d')
# drop the index column that to_csv wrote out above
df_1 = df_1.drop(['Unnamed: 0'], axis=1)
df_1['day_of_week'] = df_1['Day'].dt.dayofweek
df_1['week'] = df_1['Day'].dt.week  # deprecated in newer pandas: use dt.isocalendar().week
print(df_1.dtypes)
print(df_1.head(9))
listofcolumns = df_1.columns
print(listofcolumns)
#print("shape: ", df_1.shape)
#df_1.plot(subplots=True, figsize=(20,25), grid=True)
df_1['19'].plot(subplots=True, figsize=(20,15), grid=True, style= 'k.')
def add_stats(df_stats, columnName):
    # add rolling mean, min, max and a lagged copy to the given column
    wwidth = 5  # rolling window width
    window2 = df_stats[columnName].rolling(window=wwidth)
    # add statistical columns
    df_stats[columnName+'-avg'] = window2.mean()
    df_stats[columnName+'-min'] = window2.min()
    df_stats[columnName+'-max'] = window2.max()
    # lagged copy of the column, shifted by wwidth-1 steps
    columnNameShift = columnName+'-t-'+str(wwidth)
    df_stats[columnNameShift] = df_stats[columnName].shift(wwidth-1)
    return df_stats
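# quick illustration of add_stats on hypothetical data:
#   demo = pd.DataFrame({'x': [1., 2., 4., 7., 11., 16.]})
#   add_stats(demo, 'x')  # adds 'x-avg', 'x-min', 'x-max' and 'x-t-5'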
def loop_columns(df_loop, exclude=[]):
    # loop over all columns, skipping the given list of column names
    # df_loop: dataframe whose columns are iterated
    # exclude: list of column names to exclude from the loop
    # returns the (possibly altered) dataframe
    for (columnName, columnData) in df_loop.iteritems():
        if columnName not in exclude:
            # do something with this column, e.g. call a function
            df_loop = add_stats(df_loop, columnName)
    return df_loop
df_2 = df_1.copy()
df_2 = loop_columns(df_2, ["Day"])
# calendar features derived from the Day column
df_2['month'] = df_2.Day.dt.month
df_2['day'] = df_2.Day.dt.day
df_2['year'] = df_2.Day.dt.year
df_2['day_diff'] = (df_2.Day - df_2.Day[0]).dt.days
df_2['const'] = 1
# look at specific columns: is everything right?
print(df_2.iloc[:,:30].head())
#print(df_2.dtypes)
#print(df_2.columns)
# delete the old df to free memory:
del df_1
# show python vars in buffer:
#%whos
from matplotlib import pyplot
import seaborn as sns
# use %matplotlib qt for a separate plot window; inline here
%matplotlib inline
plot_column_y = "19"
plot_column_x = "Day"
ax=df_2.plot(x=plot_column_x, y=plot_column_y, color="red", figsize=(20,15), grid=False)
df_2.plot(x=plot_column_x, y=[plot_column_y+'-min', plot_column_y+'-max'], ax=ax, color="grey")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-t-5' ], ax=ax, color="green")
df_2.plot(x=plot_column_x, y=[plot_column_y+'-avg' ], ax=ax, color="blue")
pyplot.show()
# compare two datasets from the imported data:
%matplotlib inline
fig, ax1 = pyplot.subplots(figsize=(20,10))
# use ax1.twinx() here for two independent y-scales; both curves share one axis
ax2 = ax1  # .twinx()
ax1.plot(df_2[plot_column_x], df_2[plot_column_y], 'g-')
ax2.plot(df_2[plot_column_x], df_2['total'], 'b-')
ax1.set_xlabel(plot_column_x)
ax1.set_ylabel(plot_column_y, color='g')
ax2.set_ylabel('total', color='b')
ax1.set(title=plot_column_y+" and total" )
pyplot.show()
# create a difference transform of the dataset
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        diff.append(dataset[i] - dataset[i - 1])
    return np.array(diff)
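# e.g. difference(np.array([3., 5., 4.])) -> array([ 2., -1.])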
# calc diff and ln diff
df_2[plot_column_y+'-ln'] = np.log(df_2[plot_column_y])
df_2[plot_column_y+'-ln'+'-diff'] = df_2[plot_column_y+'-ln'].dropna().diff()
df_2[plot_column_y+'-diff'] = df_2[plot_column_y].diff()
df_2.head()
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
# Differences
axes[0].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-diff'], '-')
axes[0].set(title='difference of '+plot_column_y)
# Log difference
axes[1].plot(df_2.index._mpl_repr(), df_2[plot_column_y+'-ln'+"-diff"], '-')
axes[1].hlines(0, df_2.index[0], df_2.index[-1], 'r')
axes[1].set(title='19 - difference of logs');
# plot with various axes scales
pyplot.figure(figsize=(20,9))
n_bins = 50
pyplot.subplot(221)
pyplot.plot(df_2['19-avg'] )
pyplot.yscale('linear')
pyplot.title('19 avg linear')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 linear hist')
pyplot.grid(True)
pyplot.subplot(223)
pyplot.plot(df_2['19-avg'])
pyplot.yscale('log')
pyplot.title('19 avg log')
pyplot.grid(True)
pyplot.subplot(224)
pyplot.hist(df_2['19'],bins=n_bins )
pyplot.yscale('log')
pyplot.title('19 log hist')
pyplot.grid(True)
pyplot.show()
print('The histograms show that the values are not Gaussian-distributed.\n\n')
sns.boxplot( y=df_2['19-avg']);
pyplot.show()
This plot draws a box between the 25th and 75th percentiles, capturing the middle 50% of the observations. A line marks the 50th percentile (the median), and whiskers above and below the box summarize the general extent of the data. Dots mark outliers beyond the whiskers.
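To put numbers on what the box summarizes, the quartiles and whisker bounds can be computed directly; a minimal sketch on the same column:
q1, med, q3 = df_2['19-avg'].quantile([0.25, 0.5, 0.75])
iqr = q3 - q1
print('Q1=%.1f median=%.1f Q3=%.1f whiskers=[%.1f, %.1f]'
      % (q1, med, q3, q1 - 1.5*iqr, q3 + 1.5*iqr))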
pyplot.figure(figsize=(20,9))
n_bins = 50
pyplot.subplot(221)
pyplot.plot(np.sqrt(df_2['19-avg'].values))
pyplot.yscale('linear')
pyplot.title('19 avg sqrt')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(np.sqrt(df_2['19-avg'].values), bins=n_bins )
pyplot.yscale('linear')
pyplot.title('19 avg sqrt hist')
pyplot.grid(True)
pyplot.show()
from scipy.stats import boxcox
# boxcox requires strictly positive data and cannot handle the leading
# NaNs from the rolling window, hence the dropna()
boxcox_values, lam = boxcox(df_2['19-avg'].dropna().values)
print('Lambda: %f' % lam)
pyplot.figure(figsize=(20,9))
pyplot.subplot(221)
pyplot.plot(boxcox_values)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox')
pyplot.grid(True)
pyplot.subplot(222)
pyplot.hist(boxcox_values, bins=50)
pyplot.yscale('linear')
pyplot.title('19 avg boxcox hist')
pyplot.grid(True)
pyplot.show()
import statsmodels.api as sm
# Graph data
fig, axes = pyplot.subplots(1, 2, figsize=(25,4))
print(df_2.iloc[1:][plot_column_y+'-ln'+'-diff'])
fig = sm.graphics.tsa.plot_acf(df_2['19-avg'].dropna(), lags=150, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(df_2['19-avg'].dropna(), lags=150, ax=axes[1])
A correlation value calculated between two groups of numbers, such as observations and their lag-1 values, lies between -1 and 1. The sign indicates a negative or positive correlation, respectively. A value close to zero suggests a weak correlation, whereas a value closer to -1 or 1 indicates a strong correlation.
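As a quick sanity check, the lag-1 value shown in the ACF plot can be approximated from first principles (a sketch on the smoothed column; the Series.autocorr() call used a few cells below gives the same number):
v = df_2['19-avg'].dropna().values
# Pearson correlation between the series and its lag-1 copy
print(np.corrcoef(v[:-1], v[1:])[0, 1])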
from pandas.plotting import autocorrelation_plot
pyplot.figure(figsize=(15, 6))
ax = autocorrelation_plot(df_2['19-avg'].dropna())
ax.set_xlim([0,120])
ax.set_ylim([-0.25,0.75])
print(df_2['19-avg'].dropna().autocorr())
pyplot.show()
Dotted lines are provided to indicate that any correlation values above those lines are statistically significant (meaningful).
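The placement of those significance bands can be reproduced by hand: under a white-noise null hypothesis they sit at roughly ±z/√n (a sketch for the 95% band with z ≈ 1.96):
n = len(df_2['19-avg'].dropna())
print('95%% significance band: +/- %.3f' % (1.96 / np.sqrt(n)))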
from statsmodels.tsa.seasonal import seasonal_decompose
# multiplicative decomposition with a yearly cycle
# (newer statsmodels releases use period=365 instead of freq=365)
result = seasonal_decompose(df_2['19-avg'].dropna(), model='multiplicative', freq=365)
result.plot()
pyplot.show()
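The returned object exposes the individual components for further inspection; a minimal sketch:
# trend, seasonal and residual are Series aligned with the input index
print(result.trend.dropna().head())
print(result.seasonal.head())
print(result.resid.dropna().head())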
The following grid search takes about 10 hours when run over the full parameter ranges.
import warnings
from math import sqrt
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
# Print iterations progress
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Call in a loop to create a terminal progress bar
    @params:
        iteration  - Required : current iteration (Int)
        total      - Required : total iterations (Int)
        prefix     - Optional : prefix string (Str)
        suffix     - Optional : suffix string (Str)
        decimals   - Optional : positive number of decimals in percent complete (Int)
        length     - Optional : character length of bar (Int)
        fill       - Optional : bar fill character (Str)
        printEnd   - Optional : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end=printEnd)
    # print a new line on completion
    if iteration == total:
        print()
# evaluate an ARIMA model for a given order (p,d,q) with walk-forward validation
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make one-step predictions, refitting after each step
    predictions = list()
    printProgressBar(0, len(test), prefix='Progress:', suffix='Complete', length=80)
    for t in range(len(test)):
        printProgressBar(t, len(test), prefix='Progress:', suffix='Complete', length=80)
        model = ARIMA(endog=history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out-of-sample error
    rmse = sqrt(mean_squared_error(test, predictions))
    return rmse
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    best_score, best_cfg = float("inf"), None
    i = 1
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    print(order)
                    rmse = evaluate_arima_model(dataset, order)
                    if rmse < best_score:
                        best_score, best_cfg = rmse, order
                    print('ARIMA%s RMSE=%.3f Round %s' % (order, rmse, i))
                    i += 1
                except Exception:
                    # some orders fail to converge; skip them
                    continue
    print('Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))
# data to fit
#columnpredict = "19-avg-log"
columnpredict = "19-avg"
series = df_2[columnpredict].astype('float64').dropna().values
# evaluate parameters (restricted here to the best order found so far;
# widen the ranges for a full grid search)
p_values = [9]  # range(3, 4)
d_values = [1]  # range(0, 3)
q_values = [4]  # range(0, 4)
warnings.filterwarnings("ignore")
evaluate_models(series, p_values, d_values, q_values)
Results log of the grid search runs:

- Best ARIMA(1, 0, 0) RMSE=173.122 for 19 ETH
- Best ARIMA(2, 1, 2) RMSE=47.045 for 19-avg ETH
- Best ARIMA(1, 1, 2) RMSE=422.358 for 19-avg ETH, all data
- Best ARIMA(4, 1, 2) RMSE=41.080 for 19-avg ETH, all data
- Best ARIMA(6, 1, 3) RMSE=39.331 for 19-avg ETH
- Best ARIMA(7, 1, 3) RMSE=38.876
- Best ARIMA(7, 1, 4) RMSE=36.079 for 19-avg ETH
- Best ARIMA(9, 1, 4) RMSE=35.904 for 19-avg ETH
- Best ARIMA(3, 0, 2) RMSE=0.252 for df_2['19-avg-log']
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
import numpy as np
# create a difference transform of the dataset (redefined here so the cell
# can run on its own)
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        diff.append(dataset[i] - dataset[i - 1])
    return np.array(diff)
columnpredict = "19-avg"
series = df_2[columnpredict].astype('float64').dropna().values
# fit the AR model on the differenced series, so that a forecast is a
# predicted change that is added back to the last observed level
X = difference(series)
model = AR(X)
model_fit = model.fit(maxlag=6, disp=False)
# must be the same country as in the DB query cell at the top
country = "ETH"
# save the model to file
model_fit.save('ar_model_'+country+'_'+columnpredict+'.pkl')
# save the differenced dataset
np.save('ar_data_'+country+'_'+columnpredict+'.npy', X)
# save the last observation
np.save('ar_obs_'+country+'_'+columnpredict+'.npy', [series[-1]])
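Note: the AR class is deprecated in newer statsmodels releases. An equivalent fit with its replacement would look roughly like this (a sketch, assuming statsmodels >= 0.12; not used below):
from statsmodels.tsa.ar_model import AutoReg
model_fit = AutoReg(X, lags=6).fit()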
from statsmodels.tsa.ar_model import ARResults
country = 'ETH'
# load the model and the saved arrays
model = ARResults.load('ar_model_'+country+'_'+columnpredict+'.pkl')
data = np.load('ar_data_'+country+'_'+columnpredict+'.npy')
last_ob = np.load('ar_obs_'+country+'_'+columnpredict+'.npy')
print(len(data))
print(data)
print(last_ob)
# make a one-step out-of-sample prediction (in difference space)
predictions = model.predict(start=len(data), end=len(data))
print('Predicted difference: %f' % predictions[0])
# transform the predicted difference back into a level forecast
yhat = predictions[0] + last_ob[0]
print('Prediction: %f' % yhat)
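For example, if the last saved observation were 250.0 and the predicted difference +3.2, the level forecast would be 250.0 + 3.2 = 253.2 (illustrative numbers).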
from math import sqrt
# add a column with the next day's value as the prediction target
df_2[columnpredict+"+t1"] = df_2[columnpredict].shift(-1)
X = df_2[[columnpredict, columnpredict+"+t1"]].dropna().values
# split into train and test sets
train_size = int(len(X) * 0.66)
train, test = X[1:train_size], X[train_size:]
train_X, train_y = train[:,0], train[:,1]
test_X, test_y = test[:,0], test[:,1]
# persistence model: the naive baseline forecast y(t+1) = y(t);
# a trained model should at least beat its RMSE
def model_persistence(x):
    return x
# trained model: forecast the next change at position idx (the AR model was
# fit on the differenced series) and add it to the current level; since the
# model was fit on the full series, these are in-sample fitted values
def model_predict(idx, current_level):
    pred_diff = model.predict(start=idx, end=idx)
    return pred_diff[0] + current_level
# walk-forward validation
predictions_persistence = list()
predictions_model = list()
i = 0
for x in test_X:
    yhat_persistence = model_persistence(x)
    predictions_persistence.append(yhat_persistence)
    yhat_model = model_predict(i + train_size, x)
    predictions_model.append(yhat_model)
    i += 1
rmse_persistence = sqrt(mean_squared_error(test_y, predictions_persistence))
rmse_model = sqrt(mean_squared_error(test_y, predictions_model))
print( ' Test RMSE persistence: %.3f ' % rmse_persistence)
print( ' Test RMSE model: %.3f ' % rmse_model)
%matplotlib inline
pyplot.figure(figsize=(18, 6))
pyplot.plot(train_y)
pyplot.plot([None for i in train_y] + [x for x in test_y], 'g-', label='test_y')
pyplot.plot([None for i in train_y] + [x for x in predictions_persistence], 'r-', label='prediction_pers')
pyplot.plot([None for i in train_y] + [x for x in predictions_model], 'y-', label='prediction_model')
pyplot.legend()
pyplot.show()
import statsmodels.api as sm
import statsmodels
fig, ax = pyplot.subplots(figsize=(15, 6))
# plot the differenced series, since the loaded model was fit on differences
ax.plot(data)
fig = model.plot_predict(len(data), len(data)+20, dynamic=True, ax=ax, plot_insample=False)
pyplot.show()