scikit-learn implementations of several ensemble learning algorithms: AdaBoost, random forest, gradient boosting, and a hand-rolled stacking model.
Could anyone reading this help me finish the stacking algorithm I wrote at the end? How do I measure its accuracy on the test set? Any additions would be appreciated.
from sklearn.datasets import load_iris
# A decision tree is used as the base model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# Bagging model
def RandomForestBagging(X, y):
    '''
    Random forest (a bagging ensemble).
    :param X: feature matrix
    :param y: target labels
    :return: None; prints test and training accuracy
    '''
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
rd = RandomForestClassifier(
n_estimators=100,
criterion="gini",
max_depth=4,
)
rd.fit(x_train, y_train)
print("随机森林的测试集上的准确率:", rd.score(x_test, y_test))
print("随机森林的训练集上的准确率:", rd.score(x_train, y_train))
# Boosting models
def GradientBoosting(X, y):
    '''
    Gradient boosting.
    :param X: feature matrix
    :param y: target labels
    :return: None; prints test and training accuracy
    '''
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(x_train, y_train)
print("梯度提升回归树的测试集上准确率:", gbrt.score(x_test, y_test))
print("梯度提升回归树的训练集上准确率:", gbrt.score(x_train, y_train))
def AdaBoosting(X, y):
    '''
    AdaBoost (adaptive boosting).
    :param X: feature matrix
    :param y: target labels
    :return: None; prints test and training accuracy
    '''
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    # Note: scikit-learn >= 1.2 renames `base_estimator` to `estimator`.
    ada = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=1),  # decision stumps as weak learners
n_estimators=100,
learning_rate=0.5,
algorithm='SAMME.R',
random_state=0
)
ada.fit(x_train, y_train)
print("自适应提升算法的测试集上准确率:", ada.score(x_test, y_test))
print("自适应提升算法的训练集上准确率:", ada.score(x_train, y_train))
# Stacking model (hand-rolled)
def selfsuanfa(x_train, y_train):
    # Three-layer stack: (nb, svc, knn) -> (nn, gb, lr) -> random forest,
    # with every layer fitted and evaluated on the same training data.
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
SEED = 1
nb = GaussianNB()
svc = SVC(C=100, probability=True)
knn = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(C=100, random_state=SEED)
nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)
    # Layer 1: fit the base models on the raw training features
nb.fit(x_train, y_train)
data1 = nb.predict(x_train).reshape(-1, 1)
svc.fit(x_train, y_train)
data2 = svc.predict(x_train).reshape(-1, 1)
knn.fit(x_train, y_train)
data3 = knn.predict(x_train).reshape(-1, 1)
data_level1 = pd.DataFrame(data1, columns=['nb'])
data_level1['svc'] = data2
data_level1['knn'] = data3
data_level1['real'] = y_train
# print(data_level1)
    # Layer 2: fit on the layer-1 predictions
X1 = data_level1[['nb', 'svc', 'knn']]
y1 = data_level1['real']
x_train1 = X1
y_train1 = y1
nn.fit(x_train1, y_train1)
data11 = nn.predict(x_train1).reshape(-1, 1)
data_level2 = pd.DataFrame(data11, columns=['nn'])
gb.fit(x_train1, y_train1)
data22 = gb.predict(x_train1).reshape(-1, 1)
data_level2['gb'] = data22
lr.fit(x_train1, y_train1)
data33 = lr.predict(x_train1).reshape(-1, 1)
data_level2['lr'] = data33
data_level2['real2'] = y_train1
# print(data_level2)
    # Layer 3: final estimator
    X2 = data_level2[['nn', 'gb', 'lr']]
    y2 = data_level2['real2']  # a Series, so sklearn does not warn about a column-vector y
    x_train2 = X2
    y_train2 = y2
    rf.fit(x_train2, y_train2)
    # This scores on the same data every layer was fitted on, so it is a
    # training-set accuracy, not a test-set one; see selfsuanfa_with_test below.
    print("Stacked ensemble training set accuracy:", rf.score(x_train2, y_train2))
if __name__ == '__main__':
iris = load_iris()
X = iris.data
y = iris.target
    RandomForestBagging(X, y)  # random forest (bagging)
    GradientBoosting(X, y)  # gradient boosting
    AdaBoosting(X, y)  # AdaBoost
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    # selfsuanfa reports only training accuracy; the call below scores the test set
    selfsuanfa(x_train, y_train)  # the hand-rolled stacking this post asks for help with
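    # One possible completion (a sketch; see selfsuanfa_with_test above):
    selfsuanfa_with_test(x_train, y_train, x_test, y_test)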