-
Notifications
You must be signed in to change notification settings - Fork 0
/
predictor.py
149 lines (115 loc) · 4.66 KB
/
predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
# coding: utf-8
# # Time series predictor
import numpy as np
import time
from datetime import date
import datetime
from datetime import timedelta
import csv
import holidays # for importing the public holidays
import re
import torch
from src.utils import *
from src.data_miner import DataMiner
from src.models import MLP
from src.dataset import GoodNightDataset
# ## Parameters
num_features = 5
min_hour = 21 # Minimum hour for sleep detection
max_hour = 5 # Maximum hour for sleep detection
train_window = 3 # Sequence length of past days
local_holidays = holidays.Italy(prov='BO') # Get the holidays in Bologna, Italy :)
EPOCHS = 500
batch_size = 16
# Directories
data_dir = "data"
data_file = "data/LastSeenDataset.csv"
# - Feature extraction: we first extract the features given the time series data of Telegram accesses.
# - Supposition: last Telegram access in very similar to the time the person goes to sleep
# ## Open Data File
with open(data_file, newline='') as csvfile:
date_list = list(csv.reader(csvfile))
date_list = convert_to_dates(date_list)
'''Test data: search calendar for local holidays'''
print("First day is holiday: ", date_list[0][0] in local_holidays)
# ## Feature engineering
# Possible features to extract:
# 1. Last seen time (arguably the most important)
# 2. Wake up time
# 3. Number of Telegram accesses during the previous day
# 4. Day of the week
# 5. Public holiday presence in the following day (using the holidays library)
# 6. (time spent on Telegram)
#
data_tensor = DataMiner(date_list).to_tensor(verbose=False)
n_features = num_features # this is number of parallel inputs
n_timesteps = train_window # this is number of timesteps
# ## Model
# Since we want to predict simple time series data, we can employ:
# - MPL: Multi Layer Perceptron, simple deep neural network with hidden layer
# - RNN: Recurrent Neural Network, more suitable for time series
# - LSTM: Long Short Term Memory, an advancement of RNN
# - Transformer: currently (2020) state of the art, but complex and possibly overpower
# - ... Other
model = MLP(n_features*n_timesteps, 1)
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# ## Build the Dataset
dataset = GoodNightDataset(data_file, n_timesteps)
print('Dataset length: ', len(dataset))
# ## Data Augmentation
# Given that the training data is not much, we can insert some noise to augment it; this will also make the model less prone to overfitting
dataset.noisy() # apply gaussian noise
# ## Training the Model
trainloader = torch.utils.data.DataLoader(dataset,
batch_size=batch_size, shuffle=True,
num_workers=0)
model.train()
losses = []
# Training loop
print('\nStarting training loop...')
for t in range(EPOCHS):
X, y = next(iter(trainloader))
optimizer.zero_grad()
prediction = model.forward(X.reshape(batch_size, n_features*n_timesteps))
loss = criterion(prediction, y)
loss.backward()
optimizer.step()
losses.append(loss.item())
if t%10 == 0 and t >= 10:
print(('Epoch: {:4} | Total mean loss: {:.6f} ').format(t, mean(losses[t-10:t])))
# ### Model evaluation on training data
# We use the trained model to predict the same data as before, this time with no noise.
# Notice that we are going to overfit if we train for too long
# Potential fixes:
# - Use validation loss
# - Use higher noise value
# - Use different noise generator
# - Find another way to augment the data
# - Collect more data
print('\nTest on training data: ')
print('(Better to test on validation dataset when you have enough data)')
with torch.no_grad():
for i in range(len(dataset)):
X, y = dataset[i]
prediction = model.forward(X.reshape(1,15)).item()
real = y.T.item()
print('Predicted: {:.4f} | Real: {:.4f}'.format(prediction, real))
# ## Saving the time
# We save the predicted time to send the message in a file, so that the Daemon can handle it
now = datetime.datetime.now()
x = dataset.get_latest_sequence()
with torch.no_grad():
p = model.forward(x.reshape(1,15)).item()
print(p)
p_sec = int(p*(max_hour+24-min_hour)*3600)
prediction = now.replace(hour=min_hour, minute=0, second=0) + timedelta(seconds=p_sec)
print('\nExpected time to go to sleep: ', prediction.strftime("%Y-%m-%d %H:%M:%S"))
'''Write the value on a text file to be read by the Daemon'''
with open ('data/prediction.txt','w') as z:
z.write(prediction.strftime("%Y-%m-%d %H:%M:%S\n"))
z.close()
with open ('data/prediction_list.txt','a') as z:
z.write(prediction.strftime("%Y-%m-%d %H:%M:%S\n"))
z.close()