I was trying to convert my data's Unix timestamps to a more readable date format. I wrote a simple Java program to do the conversion and write the result to a .csv file, and that went smoothly. I then tried to use the dates in my model by one-hot encoding them into numbers and normalizing everything. However, after my attempt to one-hot encode (which I am not sure even worked), my normalization step using make_column_transformer failed.
# model 4
# next model
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
np.set_printoptions(precision=3, suppress=True)
btc_data = pd.read_csv(
    "/content/drive/MyDrive/Science Fair/output2.csv",
    names=["Time", "Open"])
X_btc = btc_data[["Time"]]
y_btc = btc_data["Open"]
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_btc)
X_btc = enc.transform(X_btc)
print(X_btc)
X_train, X_test, y_train, y_test = train_test_split(X_btc, y_btc, test_size=0.2, random_state=62)
ct = make_column_transformer(
    (MinMaxScaler(), ["Time"])
)
ct.fit(X_train)
X_train_normal = ct.transform(X_train)
X_test_normal = ct.transform(X_test)
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
btc_model_4 = tf.keras.Sequential([
    layers.Dense(100, activation="relu"),
    layers.Dense(100, activation="relu"),
    layers.Dense(100, activation="relu"),
    layers.Dense(100, activation="relu"),
    layers.Dense(100, activation="relu"),
    layers.Dense(100, activation="relu"),
    layers.Dense(1, activation="linear")
])
btc_model_4.compile(loss=tf.losses.MeanSquaredError(),
                    optimizer=tf.optimizers.Adam())
history = btc_model_4.fit(X_train_normal, y_train, batch_size=8192, epochs=100, callbacks=[callback])
btc_model_4.evaluate(X_test_normal, y_test, batch_size=8192)
y_pred = btc_model_4.predict(X_test_normal)
btc_model_4.save("btc_model_4")
btc_model_4.save("btc_model_4.h5")
# plot model
def plot_evaluations(train_data=X_train_normal,
                     train_labels=y_train,
                     test_data=X_test_normal,
                     test_labels=y_test,
                     predictions=y_pred):
    print(test_data.shape)
    print(predictions.shape)
    plt.figure(figsize=(100, 15))
    plt.scatter(train_data, train_labels, c='b', label="Training")
    plt.scatter(test_data, test_labels, c='g', label="Testing")
    plt.scatter(test_data, predictions, c='r', label="Results")
    plt.legend()
plot_evaluations()
# plot loss curve
pd.DataFrame(history.history).plot()
plt.ylabel("loss")
plt.xlabel("epochs")
My data, before encoding, is formatted like so:
2015-12-05 12:52:00,377.48
2015-12-05 12:53:00,377.5
2015-12-05 12:54:00,377.5
2015-12-05 12:56:00,377.5
2015-12-05 12:57:00,377.5
2015-12-05 12:58:00,377.5
2015-12-05 12:59:00,377.5
2015-12-05 13:00:00,377.5
2015-12-05 13:01:00,377.79
2015-12-05 13:02:00,377.5
2015-12-05 13:03:00,377.79
2015-12-05 13:05:00,377.74
2015-12-05 13:06:00,377.79
2015-12-05 13:07:00,377.64
2015-12-05 13:08:00,377.79
2015-12-05 13:10:00,377.77
2015-12-05 13:11:00,377.7
2015-12-05 13:12:00,377.77
2015-12-05 13:13:00,377.77
2015-12-05 13:14:00,377.79
2015-12-05 13:15:00,377.72
2015-12-05 13:16:00,377.5
2015-12-05 13:17:00,377.49
2015-12-05 13:18:00,377.5
2015-12-05 13:19:00,377.5
2015-12-05 13:20:00,377.8
2015-12-05 13:21:00,377.84
2015-12-05 13:22:00,378.29
2015-12-05 13:23:00,378.3
2015-12-05 13:24:00,378.3
2015-12-05 13:25:00,378.33
2015-12-05 13:26:00,378.33
2015-12-05 13:28:00,378.31
2015-12-05 13:29:00,378.68
The first value is the date and the second, after the comma, is the price of BTC at that time. After "one-hot encoding", I added a print statement to show those X values, which gave the following output:
(0, 0) 1.0
(1, 1) 1.0
(2, 2) 1.0
(3, 3) 1.0
(4, 4) 1.0
(5, 5) 1.0
(6, 6) 1.0
(7, 7) 1.0
(8, 8) 1.0
(9, 9) 1.0
(10, 10) 1.0
(11, 11) 1.0
(12, 12) 1.0
(13, 13) 1.0
(14, 14) 1.0
(15, 15) 1.0
(16, 16) 1.0
(17, 17) 1.0
(18, 18) 1.0
(19, 19) 1.0
(20, 20) 1.0
(21, 21) 1.0
(22, 22) 1.0
(23, 23) 1.0
(24, 24) 1.0
: :
(2526096, 2526096) 1.0
(2526097, 2526097) 1.0
(2526098, 2526098) 1.0
(2526099, 2526099) 1.0
(2526100, 2526100) 1.0
(2526101, 2526101) 1.0
(2526102, 2526102) 1.0
(2526103, 2526103) 1.0
(2526104, 2526104) 1.0
(2526105, 2526105) 1.0
(2526106, 2526106) 1.0
(2526107, 2526107) 1.0
(2526108, 2526108) 1.0
(2526109, 2526109) 1.0
(2526110, 2526110) 1.0
(2526111, 2526111) 1.0
(2526112, 2526112) 1.0
(2526113, 2526113) 1.0
(2526114, 2526114) 1.0
(2526115, 2526115) 1.0
(2526116, 2526116) 1.0
(2526117, 2526117) 1.0
(2526118, 2526118) 1.0
(2526119, 2526119) 1.0
(2526120, 2526120) 1.0
When fitting the column transformer for normalization, I receive the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
408 try:
--> 409 all_columns = X.columns
410 except AttributeError:
AttributeError: columns not found
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
410 except AttributeError:
411 raise ValueError(
--> 412 "Specifying the columns using strings is only "
413 "supported for pandas DataFrames"
414 )
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
Am I one-hot encoding correctly? What is the appropriate way to do this? Should I directly implement the one-hot encoder in my normalization process?
Using OneHotEncoder is not the way to go here: every timestamp is unique, so the encoder creates one column per row, which is why your printed output looks like a giant identity matrix. It's better to extract features from the Time column, such as year, month, day, hour, minute, etc., and give these columns as input to your model.
btc_data['Year'] = btc_data['Time'].astype('datetime64[ns]').dt.year
btc_data['Month'] = btc_data['Time'].astype('datetime64[ns]').dt.month
btc_data['Day'] = btc_data['Time'].astype('datetime64[ns]').dt.day
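Since the data is at minute resolution, hour and minute can be extracted the same way; a minimal sketch (parsing once with pd.to_datetime to avoid repeating the conversion, and the final feature-column selection is my assumption):
ts = pd.to_datetime(btc_data['Time'])  # parse once instead of per feature
btc_data['Hour'] = ts.dt.hour
btc_data['Minute'] = ts.dt.minute
X_btc = btc_data[['Year', 'Month', 'Day', 'Hour', 'Minute']]  # assumed feature set for the model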
The issue itself comes from the OneHotEncoder, which returns a scipy sparse matrix and drops the "Time" column; to correct this you must convert the output back to a pandas DataFrame and re-add the "Time" column.
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_btc)
X_btc = enc.transform(X_btc)           # scipy sparse matrix
X_btc = pd.DataFrame(X_btc.todense())  # convert back to a dense DataFrame
X_btc["Time"] = btc_data["Time"]       # re-attach the "Time" column
With roughly 2.5 million unique timestamps, however, the dense matrix is about 2.5M x 2.5M and will not fit in memory. One way to work around the memory issue is to keep the encoded matrix sparse, scale the "Time" column from the original DataFrame in a parallel split, and feed the model through batch generators:
# MinMaxScaler needs numeric input; convert the Time strings to integer timestamps first
btc_data["Time"] = pd.to_datetime(btc_data["Time"]).astype("int64")
X_train, X_test, y_train, y_test = train_test_split(X_btc, y_btc, test_size=0.2, random_state=62)  # sparse features
X_train_pd, X_test_pd, y_train_pd, y_test_pd = train_test_split(btc_data, y_btc, test_size=0.2, random_state=62)  # parallel DataFrame split
ct = make_column_transformer((MinMaxScaler(), ["Time"]))
ct.fit(X_train_pd)
result_train = ct.transform(X_train_pd)
result_test = ct.transform(X_test_pd)
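Because both calls share random_state=62 and operate on the same number of rows, the two splits use the identical shuffle, so the sparse rows and the scaled Time rows stay aligned. A quick sanity check (my addition, assuming y_btc keeps the original row order):
assert (y_train.index == y_train_pd.index).all()  # identical shuffles => rows correspond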
def nn_batch_generator(X_data, y_data, scaled, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch / batch_size
    counter = 0
    index = np.arange(np.shape(y_data)[0])
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        scaled_array = scaled[index_batch]          # scaled Time values for this batch
        X_batch = X_data[index_batch, :].todense()  # densify only one batch at a time
        y_batch = y_data.iloc[index_batch]
        counter += 1
        yield np.array(np.hstack((np.array(X_batch), scaled_array))), np.array(y_batch)
        if counter >= number_of_batches:            # wrap around after a full pass
            counter = 0
def nn_batch_generator_test(X_data, scaled, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch / batch_size
    counter = 0
    index = np.arange(np.shape(X_data)[0])
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        scaled_array = scaled[index_batch]
        X_batch = X_data[index_batch, :].todense()
        counter += 1
        yield np.hstack((X_batch, scaled_array))    # no labels at prediction time
        if counter >= number_of_batches:
            counter = 0
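The step counts used below follow from the dataset sizes: one epoch is one full pass, so steps = ceil(samples / batch_size). A minimal sketch (train_steps and test_steps are helper names I introduce):
import math
train_steps = math.ceil(X_train.shape[0] / 2)  # batches per full pass at batch_size=2
test_steps = math.ceil(X_test.shape[0] / 2)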
Finally, fit the model. Note that batch_size must not be passed to fit, evaluate, or predict when the input is a generator, since the generator already produces batches:
history = btc_model_4.fit(nn_batch_generator(X_train, y_train, scaled=result_train, batch_size=2),
                          steps_per_epoch=train_steps,
                          epochs=10,
                          callbacks=[callback])
btc_model_4.evaluate(nn_batch_generator(X_test, y_test, scaled=result_test, batch_size=2), steps=test_steps)
y_pred = btc_model_4.predict(nn_batch_generator_test(X_test, scaled=result_test, batch_size=2), steps=test_steps)