ValueError after attempting to use OneHotEncoder and then normalize values with make_column_transformer

I was trying to convert my data's Unix timestamps into a more readable date format. I wrote a simple Java program to do the conversion and write the result to a .csv file, and that went smoothly. I then tried to use the dates in my model by one-hot encoding them into numbers and normalizing everything. However, after my attempt to one-hot encode (which I am not sure even worked), the normalization step using make_column_transformer failed.

# model 4
# next model
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

np.set_printoptions(precision=3, suppress=True)
btc_data = pd.read_csv(
    "/content/drive/MyDrive/Science Fair/output2.csv",
    names=["Time", "Open"])

X_btc = btc_data[["Time"]]
y_btc = btc_data["Open"]

enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_btc)

X_btc = enc.transform(X_btc)

print(X_btc)

X_train, X_test, y_train, y_test = train_test_split(X_btc, y_btc, test_size=0.2, random_state=62)

ct = make_column_transformer(
    (MinMaxScaler(), ["Time"])
)

ct.fit(X_train)
X_train_normal = ct.transform(X_train)
X_test_normal = ct.transform(X_test)

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

btc_model_4 = tf.keras.Sequential([
  layers.Dense(100, activation="relu"),
  layers.Dense(100, activation="relu"),
  layers.Dense(100, activation="relu"),
  layers.Dense(100, activation="relu"),
  layers.Dense(100, activation="relu"),
  layers.Dense(100, activation="relu"),
  layers.Dense(1, activation="linear")
])

btc_model_4.compile(loss = tf.losses.MeanSquaredError(),
                      optimizer = tf.optimizers.Adam())

history = btc_model_4.fit(X_train_normal, y_train, batch_size=8192, epochs=100, callbacks=[callback])

btc_model_4.evaluate(X_test_normal, y_test, batch_size=8192)

y_pred = btc_model_4.predict(X_test_normal)

btc_model_4.save("btc_model_4")
btc_model_4.save("btc_model_4.h5")

# plot model
def plot_evaluations(train_data=X_train_normal,
                     train_labels=y_train,
                     test_data=X_test_normal,
                     test_labels=y_test,
                     predictions=y_pred):
  print(test_data.shape)
  print(predictions.shape)

  plt.figure(figsize=(100, 15))
  plt.scatter(train_data, train_labels, c='b', label="Training")
  plt.scatter(test_data, test_labels, c='g', label="Testing")
  plt.scatter(test_data, predictions, c='r', label="Results")
  plt.legend()

plot_evaluations()

# plot loss curve
pd.DataFrame(history.history).plot()
plt.ylabel("loss")
plt.xlabel("epochs")

My data format looks like this:

2015-12-05 12:52:00,377.48
2015-12-05 12:53:00,377.5
2015-12-05 12:54:00,377.5
2015-12-05 12:56:00,377.5
2015-12-05 12:57:00,377.5
2015-12-05 12:58:00,377.5
2015-12-05 12:59:00,377.5
2015-12-05 13:00:00,377.5
2015-12-05 13:01:00,377.79
2015-12-05 13:02:00,377.5
2015-12-05 13:03:00,377.79
2015-12-05 13:05:00,377.74
2015-12-05 13:06:00,377.79
2015-12-05 13:07:00,377.64
2015-12-05 13:08:00,377.79
2015-12-05 13:10:00,377.77
2015-12-05 13:11:00,377.7
2015-12-05 13:12:00,377.77
2015-12-05 13:13:00,377.77
2015-12-05 13:14:00,377.79
2015-12-05 13:15:00,377.72
2015-12-05 13:16:00,377.5
2015-12-05 13:17:00,377.49
2015-12-05 13:18:00,377.5
2015-12-05 13:19:00,377.5
2015-12-05 13:20:00,377.8
2015-12-05 13:21:00,377.84
2015-12-05 13:22:00,378.29
2015-12-05 13:23:00,378.3
2015-12-05 13:24:00,378.3
2015-12-05 13:25:00,378.33
2015-12-05 13:26:00,378.33
2015-12-05 13:28:00,378.31
2015-12-05 13:29:00,378.68

The first value is the date and the second, after the comma, is the price of BTC at that time. After "one-hot encoding", I added a print statement for the X values, which produced the following output:

  (0, 0)    1.0
  (1, 1)    1.0
  (2, 2)    1.0
  (3, 3)    1.0
  (4, 4)    1.0
  (5, 5)    1.0
  (6, 6)    1.0
  (7, 7)    1.0
  (8, 8)    1.0
  (9, 9)    1.0
  (10, 10)  1.0
  (11, 11)  1.0
  (12, 12)  1.0
  (13, 13)  1.0
  (14, 14)  1.0
  (15, 15)  1.0
  (16, 16)  1.0
  (17, 17)  1.0
  (18, 18)  1.0
  (19, 19)  1.0
  (20, 20)  1.0
  (21, 21)  1.0
  (22, 22)  1.0
  (23, 23)  1.0
  (24, 24)  1.0
  : :
  (2526096, 2526096)    1.0
  (2526097, 2526097)    1.0
  (2526098, 2526098)    1.0
  (2526099, 2526099)    1.0
  (2526100, 2526100)    1.0
  (2526101, 2526101)    1.0
  (2526102, 2526102)    1.0
  (2526103, 2526103)    1.0
  (2526104, 2526104)    1.0
  (2526105, 2526105)    1.0
  (2526106, 2526106)    1.0
  (2526107, 2526107)    1.0
  (2526108, 2526108)    1.0
  (2526109, 2526109)    1.0
  (2526110, 2526110)    1.0
  (2526111, 2526111)    1.0
  (2526112, 2526112)    1.0
  (2526113, 2526113)    1.0
  (2526114, 2526114)    1.0
  (2526115, 2526115)    1.0
  (2526116, 2526116)    1.0
  (2526117, 2526117)    1.0
  (2526118, 2526118)    1.0
  (2526119, 2526119)    1.0
  (2526120, 2526120)    1.0

When fitting the column transformer for normalization, I receive the following error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
    408         try:
--> 409             all_columns = X.columns
    410         except AttributeError:

5 frames
AttributeError: columns not found

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
    410         except AttributeError:
    411             raise ValueError(
--> 412                 "Specifying the columns using strings is only "
    413                 "supported for pandas DataFrames"
    414             )

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

Am I one-hot encoding correctly? What is the appropriate way to do this? Should I build the one-hot encoder directly into my normalization pipeline?

1 Answer

Using OneHotEncoder is not the way to go here; it's better to extract features from the Time column as separate features (year, month, day, hour, minute, etc.) and give those columns as input to your model:

# parse the date strings once, then pull out the components
dates = pd.to_datetime(btc_data['Time'])
btc_data['Year'] = dates.dt.year
btc_data['Month'] = dates.dt.month
btc_data['Day'] = dates.dt.day
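
For instance, a minimal sketch of how those extracted columns could be scaled and fed to the model in place of the one-hot encoding (imports as in the question; assumes the Year/Month/Day columns created above):

X_btc = btc_data[["Year", "Month", "Day"]]
y_btc = btc_data["Open"]

X_train, X_test, y_train, y_test = train_test_split(X_btc, y_btc, test_size=0.2, random_state=62)

# scale each extracted date feature to [0, 1]; X_btc is still a DataFrame,
# so selecting columns by name works here
ct = make_column_transformer((MinMaxScaler(), ["Year", "Month", "Day"]))
X_train_normal = ct.fit_transform(X_train)
X_test_normal = ct.transform(X_test)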

That said, the error itself comes from the OneHotEncoder: it returns a scipy sparse matrix and drops the "Time" column name, so make_column_transformer can no longer select columns by string. To correct this you must convert the output back to a pandas DataFrame and re-add the "Time" column.

enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(X_btc)
X_btc = enc.transform(X_btc)           # scipy sparse matrix; column names are lost
X_btc = pd.DataFrame(X_btc.todense())  # back to a DataFrame (memory-heavy)
X_btc["Time"] = btc_data["Time"]       # re-attach the column the transformer selects by name
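
Note that .todense() materializes one column per unique timestamp (over 2.5 million here), which can easily exhaust memory on its own. As a sketch, pandas can hold the one-hot result in sparse form instead, via pd.DataFrame.sparse.from_spmatrix (available in pandas >= 0.25):

# keep the encoded matrix sparse instead of densifying it
X_btc = pd.DataFrame.sparse.from_spmatrix(enc.transform(btc_data[["Time"]]))
X_btc["Time"] = btc_data["Time"]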

Another way to work around the memory issue during training is:

  1. Generate two splits with the same random_state, one from the scipy sparse matrix and one from the pandas data frame, so their rows stay aligned:

X_train, X_test, y_train, y_test = train_test_split(X_btc, y_btc, test_size=0.2, random_state=62)
X_train_pd, X_test_pd, y_train_pd, y_test_pd = train_test_split(btc_data, y_btc, test_size=0.2, random_state=62)

  2. Fit the MinMaxScaler() on the pandas data frame, which still has its column names:

ct = make_column_transformer((MinMaxScaler(), ["Time"]))
ct.fit(X_train_pd)
result_train = ct.transform(X_train_pd)
result_test = ct.transform(X_test_pd)

  3. Feed the training and test data through generators (this avoids loading the dense matrix into memory) and include the scaled time in each batch:
def nn_batch_generator(X_data, y_data, scaled, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch / batch_size
    counter = 0
    index = np.arange(np.shape(y_data)[0])
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        scaled_array = scaled[index_batch]          # scaled "Time" values for this batch
        X_batch = X_data[index_batch, :].todense()  # densify only one small batch at a time
        y_batch = y_data.iloc[index_batch]
        counter += 1
        # append the scaled time as an extra feature column
        yield np.array(np.hstack((np.array(X_batch), scaled_array))), np.array(y_batch)
        if counter > number_of_batches:
            counter = 0


def nn_batch_generator_test(X_data, scaled, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch / batch_size
    counter = 0
    index = np.arange(np.shape(X_data)[0])
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        scaled_array = scaled[index_batch]
        X_batch = X_data[index_batch, :].todense()
        counter += 1
        # no labels at prediction time, only the features
        yield np.hstack((X_batch, scaled_array))
        if counter > number_of_batches:
            counter = 0
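
Keras also needs to know how many generator batches make up one full pass over the data. A minimal sketch of how those step counts could be derived (mirroring number_of_batches inside the generators; batch_size must match the one passed to them):

import math

batch_size = 2
steps_train = math.ceil(X_train.shape[0] / batch_size)
steps_test = math.ceil(X_test.shape[0] / batch_size)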

Finally, fit, evaluate, and run predictions with the model:


# batch_size is set inside the generators, so it is not passed to fit/evaluate/predict
history = btc_model_4.fit(nn_batch_generator(X_train, y_train, scaled=result_train, batch_size=2),
                          steps_per_epoch=steps_train,
                          epochs=10,
                          callbacks=[callback])

btc_model_4.evaluate(nn_batch_generator(X_test, y_test, scaled=result_test, batch_size=2), steps=steps_test)
y_pred = btc_model_4.predict(nn_batch_generator_test(X_test, scaled=result_test, batch_size=2), steps=steps_test)
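
With the step counts rounded up this way, the final partial batch is still consumed, so each epoch sees every sample once and y_pred lines up with X_test row for row.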
