Tianchi: Happiness Prediction (a simple regression problem)
This happiness prediction task is a typical regression problem. I tackled it with a plain fully connected network (for technical reasons the result is not ideal yet; I am still optimizing and trying to break into the top 500).
A basic mind map for solving a simple regression problem with a plain fully connected network (made with 幕布 / Mubu).
My code for this problem (using Keras) is as follows:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from tensorflow.keras import models, layers, Sequential
from tensorflow.keras.layers import Dense, Dropout
from matplotlib import pyplot as plt
from tensorflow.keras.regularizers import l2
from tensorflow.keras import optimizers
import math


def getData():
    train_data = pd.read_csv('./happiness_train_abbr.csv', header=0, encoding='ISO-8859-1')
    test_data = pd.read_csv('./happiness_test_abbr.csv', header=0, encoding='ISO-8859-1')
    y_Id = test_data['id']
    # Replace the invalid label code -8 with the column mean
    train_labels = np.where(train_data['happiness'] != -8, train_data['happiness'], np.mean(train_data['happiness']))
    features = train_data.columns
    # Feature selection: drop the id, the label, and a few sparse work-related columns
    feature_col = [col for col in features if col not in ['id', 'happiness', 'survey_time', 'work_status', 'work_yr', 'work_type', 'work_manage']]
    x_data = train_data[feature_col].copy()
    x_test = test_data[feature_col].copy()
    # Clean up invalid (negative) codes column by column, using .loc to avoid chained assignment
    x_data.loc[x_data['income'] < 0, 'income'] = 0
    x_test.loc[x_test['income'] < 0, 'income'] = 0
    x_data.loc[x_data['political'] < 0, 'political'] = 1
    x_test.loc[x_test['political'] < 0, 'political'] = 1
    x_data.loc[x_data['health_problem'] < 0, 'health_problem'] = 0
    x_test.loc[x_test['health_problem'] < 0, 'health_problem'] = 0
    x_data.loc[x_data['depression'] < 0, 'depression'] = 1
    x_test.loc[x_test['depression'] < 0, 'depression'] = 1
    x_data.loc[x_data['equity'] < 0, 'equity'] = 1
    x_test.loc[x_test['equity'] < 0, 'equity'] = 1
    x_data.loc[x_data['class'] < 0, 'class'] = 1
    x_test.loc[x_test['class'] < 0, 'class'] = 1
    x_data.loc[x_data['family_income'] < 0, 'family_income'] = 0
    x_test.loc[x_test['family_income'] < 0, 'family_income'] = 0
    x_data.loc[x_data['family_m'] <= 0, 'family_m'] = 1
    x_test.loc[x_test['family_m'] <= 0, 'family_m'] = 1
    x_data.loc[x_data['family_status'] <= 0, 'family_status'] = 1
    x_test.loc[x_test['family_status'] <= 0, 'family_status'] = 1
    x_data.loc[x_data['house'] < 0, 'house'] = 0
    x_test.loc[x_test['house'] < 0, 'house'] = 0
    x_data.loc[x_data['status_peer'] < 0, 'status_peer'] = 1
    x_test.loc[x_test['status_peer'] < 0, 'status_peer'] = 1
    x_data.loc[x_data['status_3_before'] < 0, 'status_3_before'] = 1
    x_test.loc[x_test['status_3_before'] < 0, 'status_3_before'] = 1
    x_data.loc[x_data['view'] < 0, 'view'] = 1
    x_test.loc[x_test['view'] < 0, 'view'] = 1
    x_data.loc[x_data['inc_ability'] < 0, 'inc_ability'] = 1
    x_test.loc[x_test['inc_ability'] < 0, 'inc_ability'] = 1
    # Fill any remaining missing values with the column median
    x_data = x_data.fillna(x_data.median())
    x_test = x_test.fillna(x_test.median())
    return x_data, train_labels, x_test, y_Id


def train_Model(inputShape):
    rms = optimizers.RMSprop(lr=0.0002, rho=0.9, epsilon=None, decay=0.0)
    model = Sequential([Dense(64, activation='relu', kernel_regularizer=l2(0.0003), input_shape=inputShape),
                        Dropout(0.2),
                        Dense(64, activation='relu', kernel_regularizer=l2(0.0003)),
                        Dropout(0.1),
                        Dense(1)])
    model.compile(optimizer=rms, loss='mse', metrics=['mae'])
    return model


def val(x_data, y_data):
    # k-fold cross-validation to track validation MAE over epochs
    num_epoch = 500
    k = 4
    num_val_sample = int(len(x_data) / k)
    all_mae_histories = []
    for i in range(k):
        print('processing fold #', i)
        val_data = x_data[i * num_val_sample: (i + 1) * num_val_sample]
        val_targets = y_data[i * num_val_sample: (i + 1) * num_val_sample]

        partial_train_data = np.concatenate(
            [x_data[:i * num_val_sample],
             x_data[(i + 1) * num_val_sample:]],
            axis=0)
        partial_train_targets = np.concatenate(
            [y_data[:i * num_val_sample],
             y_data[(i + 1) * num_val_sample:]],
            axis=0)
        model = train_Model(x_data[1].shape)

        history = model.fit(partial_train_data, partial_train_targets,
                            validation_data=(val_data, val_targets),
                            epochs=num_epoch, batch_size=200, verbose=0)
        mae_history = history.history['val_mae']
        all_mae_histories.append(mae_history)
        print(all_mae_histories)
    # Average the per-epoch validation MAE across the k folds
    average_mae_history = [
        np.mean([x[i] for x in all_mae_histories]) for i in range(num_epoch)]

    plt.plot(range(1, len(average_mae_history) + 1), average_mae_history)
    plt.xlabel('Epochs')
    plt.ylabel('Validation MAE')
    plt.show()


if __name__ == '__main__':
    x_data, y_data, x_test, y_id = getData()
    x_data = np.array(x_data)  # (8000, 35)
    y_data = np.array(y_data)  # (8000,)
    x_test = np.array(x_test)  # (2968, 35)
    # Standardize features to zero mean and unit standard deviation
    mean_test = x_test.mean(axis=0)
    x_test -= mean_test
    std_test = x_test.std(axis=0)
    x_test /= std_test
    mean_train = x_data.mean(axis=0)
    x_data -= mean_train
    std_train = x_data.std(axis=0)
    x_data /= std_train
    # val(x_data, y_data)
    model = train_Model(x_data[0].shape)
    hist = model.fit(x_data, y_data, epochs=3000, batch_size=512, verbose=0)
    print(hist.history['loss'])
    y = hist.history['loss']
    x = np.arange(1, len(y) + 1)
    plt.title("Loss")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.plot(x, y)
    plt.show()
    model.save('my_model.h5')
    y_price = model.predict(x_test)
    test_price = pd.DataFrame(y_price, columns=['happiness'])
    result = pd.concat([y_id, test_price], axis=1)
    result.to_csv('result.csv', sep=',', index=None)
Current results of my optimization attempts on Tianchi:
The second and third submissions simply increased the number of epochs on top of the first one. Although the results on the training set kept getting better, at one point even reaching 0.19, this led to overfitting.
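This is not something I did in the submissions above, but one common way to guard against this kind of overfitting is to hold out part of the training data as a validation split and stop training once the validation error stops improving. A minimal sketch using Keras's EarlyStopping callback, reusing the model, x_data and y_data from the script above (the patience value here is just an illustrative choice):

from tensorflow.keras.callbacks import EarlyStopping

# Stop training when validation MAE has not improved for 50 epochs,
# and roll back to the best weights seen so far.
early_stop = EarlyStopping(monitor='val_mae', patience=50, restore_best_weights=True)

hist = model.fit(x_data, y_data,
                 validation_split=0.2,  # hold out 20% of the training data
                 epochs=3000, batch_size=512,
                 callbacks=[early_stop], verbose=0)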
For the fourth submission, after looking closely at how the loss evolved during training, I noticed it kept bouncing around within a fixed range, so I adjusted the learning rate and added regularization and dropout layers, which produced the current result.
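A related option I have not tried yet: instead of picking a single smaller learning rate by hand, Keras's ReduceLROnPlateau callback can shrink the learning rate automatically once the loss stops falling. A rough sketch under the same training setup as above (factor, patience and min_lr are just example values):

from tensorflow.keras.callbacks import ReduceLROnPlateau

# Halve the learning rate whenever the training loss has stalled for 30 epochs,
# but never go below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=30, min_lr=1e-5)

hist = model.fit(x_data, y_data, epochs=3000, batch_size=512,
                 callbacks=[reduce_lr], verbose=0)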
I am now considering adding more feature dimensions and batch normalization to improve further (aiming for the top 500 out of 6524; currently at 1064/6524). Let's go!
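As a rough idea of what the batch normalization variant might look like, here is an untested sketch of the model builder with a BatchNormalization layer after each hidden Dense layer. The function name train_Model_bn is just my placeholder; it reuses the Sequential, Dense, Dropout, l2 and optimizers imports from the script above:

from tensorflow.keras.layers import BatchNormalization

def train_Model_bn(inputShape):
    rms = optimizers.RMSprop(lr=0.0002, rho=0.9)
    model = Sequential([
        Dense(64, activation='relu', kernel_regularizer=l2(0.0003), input_shape=inputShape),
        BatchNormalization(),  # normalize activations of the first hidden layer
        Dropout(0.2),
        Dense(64, activation='relu', kernel_regularizer=l2(0.0003)),
        BatchNormalization(),  # normalize activations of the second hidden layer
        Dropout(0.1),
        Dense(1)])
    model.compile(optimizer=rms, loss='mse', metrics=['mae'])
    return model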
PS: I'm still a beginner, so if anything here is wrong, I'd appreciate pointers from more experienced readers. Thanks!