Consider a pandas DataFrame whose missing values are encoded as np.nan; call it "datacleanpd". Below are eight useful imputation methods that I have implemented in my trial-emulation package (which I named the Wakizashi model). A shared toy setup that every snippet can run against comes first; the imputation techniques themselves follow it.
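The snippets assume numeric data and a few common imports. This setup block is a minimal, hypothetical sketch: the column names and values are made-up stand-ins for illustration, not data from the Wakizashi model.

*******************************************************************************
# Shared setup: a toy numeric frame with scattered np.nan values (hypothetical)
import numpy as np
import pandas as pd

datacleanpd = pd.DataFrame({
    'age': [34.0, np.nan, 51.0, 29.0, np.nan, 47.0],
    'bmi': [22.1, 27.4, np.nan, 24.8, 30.2, np.nan],
    'sbp': [118.0, 135.0, 141.0, np.nan, 127.0, 152.0],
})
*******************************************************************************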
*******************************************************************************
# method 1: Handle NaN values using the mean of each column
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
datacleanpd_imputed = pd.DataFrame(imputer.fit_transform(datacleanpd),
                                   columns=datacleanpd.columns)
*******************************************************************************

*******************************************************************************
# method 2: Handle NaN values using the mode (the most frequent value)
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
datacleanpd_imputed = pd.DataFrame(imputer.fit_transform(datacleanpd),
                                   columns=datacleanpd.columns)
*******************************************************************************

*******************************************************************************
# method 3: Handle NaN values using KNN imputation (each missing value is
# filled in from the values of its k nearest neighbors)
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
datacleanpd_imputed = pd.DataFrame(imputer.fit_transform(datacleanpd),
                                   columns=datacleanpd.columns)
*******************************************************************************

*******************************************************************************
# method 4: Handle NaN values using Multivariate Imputation by Chained
# Equations (MICE)
from sklearn.experimental import enable_iterative_imputer  # enables IterativeImputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer()
datacleanpd_imputed = pd.DataFrame(imputer.fit_transform(datacleanpd),
                                   columns=datacleanpd.columns)
*******************************************************************************

*******************************************************************************
# method 5: Handle NaN values using linear interpolation (suited to ordered
# or time-series data; note that leading NaNs have no earlier point to
# interpolate from and remain NaN)
datacleanpd_imputed = datacleanpd.interpolate(method='linear')
*******************************************************************************

*******************************************************************************
# method 6: Handle NaN values using regression imputation
from sklearn.linear_model import LinearRegression

datacleanpd_imputed = datacleanpd.copy()
# LinearRegression cannot handle NaNs in its inputs, so use a mean-filled
# copy of the frame as the feature matrix
features = datacleanpd.fillna(datacleanpd.mean())
for column in datacleanpd.columns:
    if datacleanpd[column].isnull().any():
        missing = datacleanpd[column].isnull()
        X = features.drop(columns=[column])
        # Train a model on the rows where the target column is observed
        model = LinearRegression()
        model.fit(X[~missing], datacleanpd.loc[~missing, column])
        # Predict the missing values
        datacleanpd_imputed.loc[missing, column] = model.predict(X[missing])
*******************************************************************************

*******************************************************************************
# method 7: Handle NaN values using Multiple Imputation
from sklearn.experimental import enable_iterative_imputer  # enables IterativeImputer
from sklearn.impute import IterativeImputer

# Draw 50 imputed datasets; sample_posterior=True makes each draw stochastic
imputations = [
    pd.DataFrame(
        IterativeImputer(max_iter=10, sample_posterior=True,
                         random_state=i).fit_transform(datacleanpd),
        columns=datacleanpd.columns, index=datacleanpd.index)
    for i in range(50)
]
# Combine the imputed datasets: take the element-wise mean of the 50 draws
datacleanpd_imputed = sum(imputations) / len(imputations)
*******************************************************************************

*******************************************************************************
# method 8: Handle NaN values using an autoencoder (deep learning)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

# Prepare the data: start from a mean-imputed copy scaled into [0, 1]
initial_imputer = SimpleImputer(strategy='mean')
df_initial_imputed = pd.DataFrame(initial_imputer.fit_transform(datacleanpd),
                                  columns=datacleanpd.columns)
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(df_initial_imputed)

# Build a simple dense autoencoder
input_dim = data_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(128, activation='relu')(input_layer)
encoder = Dense(64, activation='relu')(encoder)
decoder = Dense(128, activation='relu')(encoder)
decoder = Dense(input_dim, activation='sigmoid')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Training the autoencoder to reconstruct the (initially imputed) data
autoencoder.fit(data_scaled, data_scaled, epochs=100, batch_size=32,
                shuffle=True, verbose=1)

# Reconstruct, rescale, and keep the network's output only where the
# original values were missing
data_imputed = scaler.inverse_transform(autoencoder.predict(data_scaled))
datacleanpd_imputed = datacleanpd.fillna(
    pd.DataFrame(data_imputed, columns=datacleanpd.columns,
                 index=datacleanpd.index))
*******************************************************************************
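Whichever method you use, it is worth confirming that nothing was left unimputed. Here is a minimal sanity check, assuming datacleanpd_imputed holds the output of one of the methods above:

*******************************************************************************
# Sanity check: the imputed frame should contain no remaining NaNs
# (method 5 can legitimately fail this if the frame has leading NaNs)
assert not datacleanpd_imputed.isnull().values.any()
*******************************************************************************

--msb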