Machine Learning 5: Naive Bayes (with code)

The naive Bayes algorithm is a classification algorithm built on Bayes' theorem and the assumption that the features are conditionally independent given the class. Among machine learning algorithms it is simple to implement and efficient to train, but the price is usually some loss of classification accuracy.
In terms of model taxonomy, naive Bayes is a probabilistic (generative) model: what it learns is the mechanism by which the data are generated. See Section 1.2.2 of Chapter 1 of these notes for details.
1. The conditional independence assumption
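As a sketch in standard notation (added here; $x^{(j)}$ denotes the $j$-th feature of an input $x$ and $c_k$ a class label), the assumption is that the features are conditionally independent given the class:

$$
P(X = x \mid Y = c_k) = P\big(X^{(1)} = x^{(1)}, \ldots, X^{(n)} = x^{(n)} \mid Y = c_k\big) = \prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_k\big).
$$

This factorization is what makes the model "naive": each one-dimensional conditional distribution is estimated on its own, instead of the full joint conditional distribution.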


2. Defining the dataset

Dataset and feature space: the training data consist of labelled samples drawn from an n-dimensional feature space.
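As a sketch in standard notation (added here, not from the original text):

$$
T = \{(x_1, y_1), (x_2, y_2), \ldots, (x_N, y_N)\}, \qquad x_i \in \mathcal{X} \subseteq \mathbf{R}^n, \quad y_i \in \mathcal{Y} = \{c_1, c_2, \ldots, c_K\}.
$$

In the iris example below, $n = 8$ (four continuous measurements plus four discretized copies) and $K = 3$.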
3. The hypothesis space

The hypothesis function is derived from Bayes' theorem.
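As a sketch of that derivation (added here): Bayes' theorem combined with the independence assumption gives the posterior

$$
P(Y = c_k \mid X = x) = \frac{P(Y = c_k)\,\prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_k\big)}{\sum_{k=1}^{K} P(Y = c_k)\,\prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_k\big)},
$$

and since the denominator is the same for every class, the hypothesis function reduces to the maximum a posteriori rule

$$
y = f(x) = \arg\max_{c_k} \; P(Y = c_k) \prod_{j=1}^{n} P\big(X^{(j)} = x^{(j)} \mid Y = c_k\big).
$$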
4. The objective function (maximum likelihood estimation)
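As a sketch of the standard maximum-likelihood estimates (added here): the prior and the class-conditional probabilities are estimated by frequency counts,

$$
\hat{P}(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k)}{N}, \qquad
\hat{P}\big(X^{(j)} = a_{jl} \mid Y = c_k\big) = \frac{\sum_{i=1}^{N} I\big(x_i^{(j)} = a_{jl},\, y_i = c_k\big)}{\sum_{i=1}^{N} I(y_i = c_k)},
$$

where $a_{jl}$ is the $l$-th possible value of feature $j$ and $I(\cdot)$ is the indicator function. For a continuous feature the code below instead fits a per-class Gaussian: the class-conditional mean and standard deviation are estimated by maximum likelihood and the normal density is evaluated at the observed value.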


5. The optimization algorithm
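Naive Bayes needs no iterative optimization: the maximum-likelihood estimates above are closed-form counts. The only refinement used in the code below is Laplace smoothing (Bayesian estimation with $\lambda = 1$), which keeps any probability from being estimated as exactly zero:

$$
P_\lambda(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k) + \lambda}{N + K\lambda}, \qquad
P_\lambda\big(X^{(j)} = a_{jl} \mid Y = c_k\big) = \frac{\sum_{i=1}^{N} I\big(x_i^{(j)} = a_{jl},\, y_i = c_k\big) + \lambda}{\sum_{i=1}^{N} I(y_i = c_k) + S_j\lambda},
$$

where $K$ is the number of classes and $S_j$ the number of distinct values of feature $j$.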


A handwritten naive Bayes implementation

The script below builds a three-class model on the iris data: the four raw measurements are treated as continuous features (per-class Gaussian densities), and four discretized copies produced with pd.cut are treated as categorical features with Laplace smoothing.

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris


class NaiveBayesModel(object):
    def __init__(self, x_train, y_train, x_test):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.N, self.n = x_train.shape  # number of samples, number of features

    def prior_probs(self):
        '''Prior probabilities of the class labels, with Laplace smoothing.
        This example is a three-class problem.'''
        labels, labels_count = np.unique(self.y_train, return_counts=True)
        label_0, label_1, label_2 = labels[0], labels[1], labels[2]
        # Laplace smoothing: add 1 to each count and K (number of classes) to the denominator
        prior_probs_0, prior_probs_1, prior_probs_2 = \
            (labels_count + 1) / (labels_count.sum() + len(labels))
        # class labels
        label_dict = {"label_0": label_0, "label_1": label_1, "label_2": label_2}
        # prior probability of each class label
        prior_probs_dict = {"prior_probs_0": prior_probs_0,
                            "prior_probs_1": prior_probs_1,
                            "prior_probs_2": prior_probs_2}
        return label_dict, prior_probs_dict

    def likelihood_probs(self, label):
        '''Likelihood of each test sample under the given class
        (Laplace smoothing for categorical features).'''
        likeli_probs_array = np.array([])
        # loop over test samples
        for i in range(len(self.x_test)):
            # joint likelihood under the conditional independence assumption
            likeli_probs_res = 1.0
            # loop over features
            for j in range(self.n):
                val = self.x_test.iloc[i, j]
                if isinstance(val, str):
                    # categorical feature: smoothed conditional probability
                    fen_zi = len(self.x_train.loc[(self.x_train.iloc[:, j] == val)
                                                  & (self.y_train == label)]) + 1
                    # denominator: class count + number of distinct values of feature j
                    fen_mu = sum(self.y_train == label) + self.x_train.iloc[:, j].nunique()
                    likeli_probs = fen_zi / fen_mu
                elif isinstance(val, float):
                    # continuous feature: Gaussian density fitted on this class
                    vals = self.x_train[self.y_train == label].iloc[:, j]
                    mean = np.mean(vals)
                    std = np.std(vals)
                    likeli_probs = norm.pdf(val, mean, std)
                else:
                    raise TypeError("Feature values must be str or float.")
                likeli_probs_res *= likeli_probs
            likeli_probs_array = np.append(likeli_probs_array, likeli_probs_res)
        return likeli_probs_array

    def predict(self, prior_probs_0, prior_probs_1, prior_probs_2,
                likeli_probs_array_0, likeli_probs_array_1, likeli_probs_array_2):
        '''Prediction: compute the (unnormalized) posterior of each class and take the maximum.'''
        posterior_probs_0 = (prior_probs_0 * likeli_probs_array_0).reshape(-1, 1)
        posterior_probs_1 = (prior_probs_1 * likeli_probs_array_1).reshape(-1, 1)
        posterior_probs_2 = (prior_probs_2 * likeli_probs_array_2).reshape(-1, 1)
        posterior_probs = np.concatenate(
            [posterior_probs_0, posterior_probs_1, posterior_probs_2], axis=1)
        # maximum a posteriori class; iris labels are 0, 1, 2, matching the column index
        y_pred = np.argmax(posterior_probs, axis=1)
        return y_pred

    def get_score(self, y_true, y_pred):
        '''Model evaluation: classification accuracy.'''
        score = sum(y_true == y_pred) / len(y_true)
        return score


if __name__ == "__main__":
    # load the data
    iris = load_iris()
    dataSet = pd.DataFrame(iris.data, columns=iris.feature_names)
    dataSet.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    # construct three-level categorical variables
    dataSet["sepal_length_mult"] = pd.cut(dataSet.sepal_length, bins=3, right=True,
                                          include_lowest=True, labels=["low", "med", "high"])
    dataSet["sepal_width_mult"] = pd.cut(dataSet.sepal_width, bins=3, right=True,
                                         include_lowest=True, labels=["low", "med", "high"])
    # construct two-level categorical variables
    dataSet["petal_length_bina"] = pd.cut(dataSet.petal_length, bins=2, right=True,
                                          include_lowest=True, labels=["small", "big"])
    dataSet["petal_width_mult"] = pd.cut(dataSet.petal_width, bins=2, right=True,
                                         include_lowest=True, labels=["small", "big"])
    # class labels
    dataSet["label"] = iris.target
    # shuffle the rows
    dataSet = shuffle(dataSet)  # , random_state=0
    x = dataSet.iloc[:, 0:8]
    y = dataSet.iloc[:, 8]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

    # handwritten model
    model = NaiveBayesModel(x_train, y_train, x_test)
    label_dict, prior_probs_dict = model.prior_probs()
    label_0 = label_dict.get("label_0")
    label_1 = label_dict.get("label_1")
    label_2 = label_dict.get("label_2")
    prior_probs_0 = prior_probs_dict.get("prior_probs_0")
    prior_probs_1 = prior_probs_dict.get("prior_probs_1")
    prior_probs_2 = prior_probs_dict.get("prior_probs_2")
    likeli_probs_array_0 = model.likelihood_probs(label=label_0)
    likeli_probs_array_1 = model.likelihood_probs(label=label_1)
    likeli_probs_array_2 = model.likelihood_probs(label=label_2)
    y_pred = model.predict(prior_probs_0, prior_probs_1, prior_probs_2,
                           likeli_probs_array_0, likeli_probs_array_1, likeli_probs_array_2)
    score = model.get_score(y_test, y_pred)
    print(f"NaiveBayesModel accuracy: {score}")

NaiveBayesModel accuracy: 0.9333333333333333
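As an optional sanity check (an added sketch, not part of the original script): scikit-learn's GaussianNB can be fitted on the four continuous columns only. It is not an exact equivalent of the mixed categorical/Gaussian model above, so the accuracies need not match, but they should be in the same range. The snippet assumes x_train, x_test, y_train, y_test from the script above are still in scope.

    # Optional cross-check with scikit-learn (continuous features only).
    # Assumes x_train, x_test, y_train, y_test from the script above.
    from sklearn.naive_bayes import GaussianNB

    gnb = GaussianNB()
    gnb.fit(x_train.iloc[:, 0:4], y_train)       # fit on the 4 raw measurements
    sk_pred = gnb.predict(x_test.iloc[:, 0:4])
    print("GaussianNB accuracy:", (sk_pred == y_test.values).mean())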
