9.16搜狗笔试-朴素贝叶斯
笔试的时候用numpy报错了,贼心不死,用python重写了一遍,还望各位大佬多多指正。
尝试将连续值离散化,效果并不好;后来改用正态分布(高斯分布)来估计每个特征在各类别下的条件概率,题目给出的测试用例好像是可以通过的。
本着学习的态度,和大家交流一下,攒一波人品,Q offer!!!
import sys import math # load data def loadDataSet(): line1 = sys.stdin.readline().strip().split() first_line = [int(item) for item in line1] train_data = [] train_label = [] for i in range(first_line[0]): line = sys.stdin.readline().strip().split() sample = [int(item) for item in line] label = sample.pop(0) train_label.append(label) train_data.append(sample) test_data = [] for j in range(first_line[1]): line = sys.stdin.readline().strip().split() line.pop(0) sample = [int(item) for item in line] test_data.append(sample) return train_data, train_label, test_data # 获得均值和方差 def getMeanAndVariance(matrix): numOfSamples = len(matrix) numOfFeatures = len(matrix[0]) mean = [0 for _ in range(numOfFeatures)] for i in range(numOfSamples): for j in range(numOfFeatures): mean[j] += matrix[i][j] mean = list(map(lambda x: x/numOfSamples, mean)) # 求得均值 variance = [0 for _ in range(numOfFeatures)] for i in range(numOfSamples): for j in range(numOfFeatures): variance[j] += (matrix[i][j] - mean[j]) ** 2 variance = list(map(lambda x: x / numOfSamples, variance)) # 求得方差 return [(m,v) for m,v in zip(mean,variance)] def trainNB(trainMatrix, trainLabels): numOfTrainSamples = len(trainMatrix) pAbusive = sum(trainLabels) / float(numOfTrainSamples) # 得到正样本的概率 samples0 = [] samples1 = [] for i in range(numOfTrainSamples): if trainLabels[i] == 1: samples1.append(trainMatrix[i]) else: samples0.append(trainMatrix[i]) mv0 = getMeanAndVariance(samples0) mv1 = getMeanAndVariance(samples1) return mv0, mv1, pAbusive # 得到高斯分布概率 def gaussianDistribution(m, v, x): zhishu = (x - m)**2/(2*v) xishu = 1/(math.sqrt(2*math.pi*v)) res = xishu*math.exp(-zhishu) return res def classifyNB(sample,mv0,mv1,pClass1): #比较概率大小进行判断 p1 = 1 p0 = 1 for (m,v),x in zip(mv1,sample): gd = gaussianDistribution(m, v, x) # 获得正太分布概率 p1 *= gd p1 *= pClass1 for (m,v),x in zip(mv0,sample): gd = gaussianDistribution(m, v, x) # 获得正太分布概率 p0 *= gd p0 *= (1-pClass1) if p1>p0: return 1 else: return 0 def testNB(): train_data, 
train_label, test_data = loadDataSet() mv0,mv1,pAb = trainNB(train_data,train_label) for test_sample in test_data: print(classifyNB(test_sample,mv0,mv1,pAb)) ''' 4 2 3 1 13 0 10 0 6 11 2 1 17 2 14 0 8 16 13 ? 20 3 19 ? 2 13 18 ''' if __name__ == '__main__': testNB()