Tianchi - Happiness Prediction (a simple regression problem)

This happiness prediction task is a fairly standard regression problem. I tackled it with a plain fully-connected network (for technical reasons the current result is not ideal; I am still optimizing and trying to break into the top 500).

A basic mind map for solving a simple regression problem with a plain fully-connected network (drawn with Mubu):

 

My code for this problem is below (written with Keras):

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tensorflow.keras import Sequential, optimizers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

def getData():
    train_data = pd.read_csv('./happiness_train_abbr.csv', header=0, encoding='ISO-8859-1')
    test_data = pd.read_csv('./happiness_test_abbr.csv', header=0, encoding='ISO-8859-1')
    y_Id = test_data['id']
    # happiness == -8 marks an invalid answer; replace it with the column mean
    train_labels = np.where(train_data['happiness'] != -8, train_data['happiness'], np.mean(train_data['happiness']))
    features = train_data.columns
    # Feature selection: drop the id, the label, the survey time and the sparsely filled work_* columns
    feature_col = [col for col in features if col not in ['id', 'happiness', 'survey_time', 'work_status', 'work_yr', 'work_type', 'work_manage']]
    x_data = train_data[feature_col].copy()
    x_test = test_data[feature_col].copy()
    # Negative codes mean "invalid/refused to answer"; replace them with a per-column default
    neg_default = {'income': 0, 'political': 1, 'health_problem': 0, 'depression': 1,
                   'equity': 1, 'class': 1, 'family_income': 0, 'house': 0,
                   'status_peer': 1, 'status_3_before': 1, 'view': 1, 'inc_ability': 1}
    nonpos_default = {'family_m': 1, 'family_status': 1}
    for df in (x_data, x_test):
        for col, default in neg_default.items():
            df.loc[df[col] < 0, col] = default
        for col, default in nonpos_default.items():
            df.loc[df[col] <= 0, col] = default
    # Fill any remaining missing values with the column median
    x_data = x_data.fillna(x_data.median())
    x_test = x_test.fillna(x_test.median())
    return x_data, train_labels, x_test, y_Id

def train_Model(inputShape):
    # Small learning rate (2e-4): with a larger one the loss kept oscillating
    rms = optimizers.RMSprop(learning_rate=0.0002, rho=0.9)
    # Two hidden layers with L2 regularization and dropout to fight overfitting
    model = Sequential([Dense(64, activation='relu', kernel_regularizer=l2(0.0003), input_shape=inputShape),
                        Dropout(0.2),
                        Dense(64, activation='relu', kernel_regularizer=l2(0.0003)),
                        Dropout(0.1),
                        Dense(1)])
    model.compile(optimizer=rms, loss='mse', metrics=['mae'])
    return model

def val(x_data, y_data):
    # K-fold cross-validation: train on k-1 folds, validate on the held-out fold
    num_epoch = 500
    k = 4
    num_val_sample = int(len(x_data) / k)
    all_mae_histories = []
    for i in range(k):
        print('processing fold #', i)
        val_data = x_data[i * num_val_sample: (i + 1) * num_val_sample]
        val_targets = y_data[i * num_val_sample: (i + 1) * num_val_sample]

        partial_train_data = np.concatenate(
            [x_data[:i * num_val_sample],
             x_data[(i + 1) * num_val_sample:]],
            axis=0)
        partial_train_targets = np.concatenate(
            [y_data[:i * num_val_sample],
             y_data[(i + 1) * num_val_sample:]],
            axis=0)
        model = train_Model(x_data[0].shape)

        history = model.fit(partial_train_data, partial_train_targets,
                            validation_data=(val_data, val_targets),
                            epochs=num_epoch, batch_size=200, verbose=0)
        all_mae_histories.append(history.history['val_mae'])
    # Average the validation MAE of each epoch across the k folds and plot it
    average_mae_history = [
        np.mean([x[i] for x in all_mae_histories]) for i in range(num_epoch)]

    plt.plot(range(1, len(average_mae_history) + 1), average_mae_history)
    plt.xlabel('Epochs')
    plt.ylabel('Validation MAE')
    plt.show()

if __name__ == '__main__':
    x_data, y_data, x_test, y_id = getData()
    x_data = np.array(x_data)  # (8000, 35)
    y_data = np.array(y_data)  # (8000,)
    x_test = np.array(x_test)  # (2968, 35)
    # Standardize each feature to zero mean and unit standard deviation
    mean_test = x_test.mean(axis=0)
    x_test -= mean_test
    std_test = x_test.std(axis=0)
    x_test /= std_test
    mean_train = x_data.mean(axis=0)
    x_data -= mean_train
    std_train = x_data.std(axis=0)
    x_data /= std_train
    # val(x_data, y_data)  # uncomment to run k-fold cross-validation
    model = train_Model(x_data[0].shape)
    hist = model.fit(x_data, y_data, epochs=3000, batch_size=512, verbose=0)
    print(hist.history['loss'])
    # Plot the training loss curve
    y = hist.history['loss']
    x = np.arange(1, len(y) + 1)
    plt.title("Loss")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.plot(x, y)
    plt.show()
    model.save('my_model.h5')
    # Predict on the test set and write the submission file
    y_price = model.predict(x_test)
    test_price = pd.DataFrame(y_price, columns=['happiness'])
    result = pd.concat([y_id, test_price], axis=1)
    result.to_csv('result.csv', sep=',', index=False)
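One detail worth flagging in the standardization step: the test set above is scaled with its own mean and standard deviation. A common alternative, shown here only as a sketch (it is not what the script above does), is to reuse the training statistics so both sets go through exactly the same transformation:

    # Sketch: standardize both sets with the *training* mean/std
    # (assumes x_data and x_test are the raw feature arrays, before any scaling)
    mean_train = x_data.mean(axis=0)
    std_train = x_data.std(axis=0)
    x_data = (x_data - mean_train) / std_train
    x_test = (x_test - mean_train) / std_train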

 

Current results of my Tianchi submissions:

 

The second and third submissions only increased the number of epochs over the first. The training loss kept improving and at one point even reached 0.19, but this led to overfitting.
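A simple way to keep the longer training runs from overfitting is early stopping; the sketch below is only an illustration on my part (the 20% validation split is an assumption, not something the script above uses):

    from tensorflow.keras.callbacks import EarlyStopping

    # Stop when the validation loss has not improved for 50 epochs
    # and roll the weights back to the best epoch seen so far
    early_stop = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
    model = train_Model(x_data[0].shape)
    model.fit(x_data, y_data, validation_split=0.2,  # assumed 20% hold-out
              epochs=3000, batch_size=512, verbose=0,
              callbacks=[early_stop])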

For the fourth submission, after looking closely at how the loss evolved during training, I noticed it kept bouncing around inside a fixed range, so I lowered the learning rate and then added L2 regularization and dropout layers, which gives the current result.
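For the learning-rate part specifically, Keras also has a ReduceLROnPlateau callback that lowers the learning rate automatically once the loss stops improving; this is just a sketch of that option, not what my fourth submission actually did:

    from tensorflow.keras.callbacks import ReduceLROnPlateau

    # Halve the learning rate whenever the training loss has stalled for 30 epochs
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=30, min_lr=1e-5)
    model = train_Model(x_data[0].shape)
    model.fit(x_data, y_data, epochs=3000, batch_size=512, verbose=0,
              callbacks=[reduce_lr])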

Right now I am considering adding more feature dimensions and batch normalization as the next optimizations. (Aiming to get into the top 500 of 6524; currently ranked 1064/6524.) Let's go!
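For reference, adding batch normalization to the current network would look roughly like the sketch below (only an illustration of the idea; train_Model_bn is a hypothetical variant of train_Model above, and the layer sizes simply mirror it):

    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
    from tensorflow.keras.regularizers import l2

    def train_Model_bn(inputShape):
        # Same two hidden layers as train_Model, with a BatchNormalization
        # layer inserted after each hidden Dense layer
        model = Sequential([Dense(64, activation='relu', kernel_regularizer=l2(0.0003), input_shape=inputShape),
                            BatchNormalization(),
                            Dropout(0.2),
                            Dense(64, activation='relu', kernel_regularizer=l2(0.0003)),
                            BatchNormalization(),
                            Dropout(0.1),
                            Dense(1)])
        model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
        return model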

P.S. I am still a beginner, so if anything here is wrong, I would appreciate pointers from more experienced readers. Thanks!

 
