第一句子网 > 利用Python实现多元伯努利事件的朴素贝叶斯分类器

利用Python实现多元伯努利事件的朴素贝叶斯分类器

时间：2022-08-13 23:07:49

前言

本篇博客所写的算法对应于吴恩达教授的机器学习教程里的多元伯努利事件模型的朴素贝叶斯。

多元伯努利事件模型的Python代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : /9/4 15:55
# @Author  : DaiPuWei
# E-Mail   : 771830171@
# @Site    : Beijiao Lab 25
# @File    : NaiveBayes.py
# @Software: PyCharm
"""Naive Bayes classifier for the multivariate Bernoulli event model.

Each document is represented by a binary vector with one entry per
vocabulary word: 1 if the word occurs in the document, 0 otherwise.
"""

import numpy as np


class NaiveBayes_Bernoulli(object):
    """Binary Naive Bayes classifier over word-presence features.

    Label ``1`` is the positive class; every other label value is treated
    as class ``0``.
    """

    def __init__(self, Train_Data, Train_Label):
        """Build the vocabulary and vectorize the training documents.

        :param Train_Data: iterable of documents, each an iterable of tokens
        :param Train_Label: labels aligned with ``Train_Data``
        """
        # Build the vocabulary from every token seen in the training set.
        self.VocabularyList = self.CreateVocabularyList(Train_Data)
        self.Train_Label = Train_Label
        self.Train_Data = Train_Data
        # Turn every training document into a 0/1 presence vector.
        self.Train_Data_Vector = np.array(
            [self.Word2Vector(data, self.VocabularyList)
             for data in self.Train_Data]
        )
        # Model parameters; filled in by Train().
        self.py1 = 0  # p(y=1)
        self.py0 = 0  # p(y=0)
        col = len(self.VocabularyList)
        self.pj_y1 = np.zeros(col)  # p(x_j=1|y=1)
        self.pj_y0 = np.zeros(col)  # p(x_j=1|y=0)

    def CreateVocabularyList(self, dataset):
        """Return the list of distinct tokens found in ``dataset``.

        Tokens are kept in first-seen order on interpreters whose dicts
        preserve insertion order, so the feature layout is reproducible
        between runs.

        :param dataset: iterable of documents, each an iterable of tokens
        :return: list of unique tokens
        """
        vocabulary = {}
        for data in dataset:
            for token in data:
                vocabulary.setdefault(token, None)
        return list(vocabulary)

    def Word2Vector(self, input_data, VocabularyList):
        """Convert one document into a 0/1 vector over ``VocabularyList``.

        Entry ``j`` is 1 when ``VocabularyList[j]`` occurs in ``input_data``.
        Tokens that are not in the vocabulary are ignored.

        :param input_data: iterable of tokens
        :param VocabularyList: list of vocabulary tokens
        :return: list of 0/1 values with ``len(VocabularyList)`` entries
        """
        # Map each word to its vocabulary position (first occurrence wins).
        position = {}
        for idx, word in enumerate(VocabularyList):
            position.setdefault(word, idx)
        vector = [0] * len(VocabularyList)
        for token in input_data:
            idx = position.get(token)
            if idx is not None:
                # Index by vocabulary position, not by position in the input.
                vector[idx] = 1
        return vector

    def Train(self):
        """Estimate the class priors and the per-word conditionals.

        Uses Laplace smoothing for a two-valued feature:
        ``p(x_j=1|y=c) = (docs of class c containing word j + 1)
        / (docs of class c + 2)``.

        Parameters are recomputed from the stored training vectors, so
        calling this method more than once gives the same result.
        """
        labels = np.asarray(self.Train_Label)
        is_pos = labels == 1
        num_total = labels.size
        num_label1 = int(np.count_nonzero(is_pos))
        num_label0 = num_total - num_label1
        # Class priors p(y=1) and p(y=0).
        self.py1 = num_label1 / float(num_total)
        self.py0 = num_label0 / float(num_total)
        # Per-class document frequency of every vocabulary word.
        vectors = np.asarray(self.Train_Data_Vector, dtype=float)
        count_y1 = vectors[is_pos].sum(axis=0)
        count_y0 = vectors[~is_pos].sum(axis=0)
        self.pj_y1 = (count_y1 + 1.0) / (num_label1 + 2.0)
        self.pj_y0 = (count_y0 + 1.0) / (num_label0 + 2.0)

    def _log_joint(self, x, pj, py):
        """Return ``log p(x|y) + log p(y)`` for a 0/1 feature vector ``x``."""
        # log(0) is allowed to yield -inf (e.g. a class with zero prior).
        with np.errstate(divide='ignore'):
            return (np.sum(x * np.log(pj) + (1.0 - x) * np.log(1.0 - pj))
                    + np.log(py))

    def predict(self, test_data):
        """Predict the label of one vectorized document.

        The two normalized posteriors are printed (p(y=0|x) first, then
        p(y=1|x)) before the label is returned.

        :param test_data: 0/1 feature vector (list or np.array)
        :return: 1 if ``p(y=1|x) >= p(y=0|x)``, otherwise 0
        """
        x = np.asarray(test_data, dtype=float)
        log_p0 = self._log_joint(x, self.pj_y0, self.py0)
        log_p1 = self._log_joint(x, self.pj_y1, self.py1)
        # Subtract the larger log-joint before exponentiating so that the
        # product of many small probabilities cannot underflow to 0/0.
        shift = max(log_p0, log_p1)
        _p0 = np.exp(log_p0 - shift)
        _p1 = np.exp(log_p1 - shift)
        px = _p0 + _p1
        p0 = float(_p0) / float(px)
        p1 = float(_p1) / float(px)
        print(p0)
        print(p1)
        if p1 >= p0:
            return 1
        else:
            return 0

    def Test(self, Test_Data):
        """Predict a label for every document in ``Test_Data``.

        :param Test_Data: iterable of documents, each an iterable of tokens
        :return: list of predicted labels (0 or 1)
        """
        return [
            self.predict(self.Word2Vector(test_data, self.VocabularyList))
            for test_data in Test_Data
        ]

测试代码

下面的测试代码所使用的数据来自《机器学习实战》一书。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : /9/9 10:23
# @Author  : DaiPuWei
# E-Mail   : 771830171@
# @Site    : Beijiao Lab 25
# @File    : Bernoulli_demo.py
# @Software: PyCharm

import numpy as np

from NaiveBayes.NaiveBayes_Bernoulli import NaiveBayes_Bernoulli


def run_main():
    """Train the Bernoulli Naive Bayes classifier on a toy corpus and print
    the labels predicted for two test documents."""
    # Training documents and labels. The documents have different lengths,
    # so they are kept as plain nested lists: building an np.array from
    # ragged rows raises ValueError on recent NumPy versions.
    traindata = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    trainlabel = np.array([0, 1, 0, 1, 0, 1])  # 1 is abusive, 0 not
    testdata = [
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
    ]
    # Build the classifier (vocabulary + vectorized training set).
    NB_Bernoulli = NaiveBayes_Bernoulli(traindata, trainlabel)
    # Fit the model parameters.
    NB_Bernoulli.Train()
    # Predict the test documents.
    predict_label = NB_Bernoulli.Test(testdata)
    print(predict_label)


if __name__ == '__main__':
    run_main()

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。