H(x) 是集成分类器的输出,h(x) 是单个基分类器的输出;把各基分类器的输出加起来再经过一个 sign 函数,相当于判断赞成票是否超过半数。
若 H(x) ≠ f(x),即集成分类结果出错,则说明至少有一半的基分类器分类错误,等价地说,至多有一半的基分类器分类正确。
import numpy as np
import pandas as pd
from sklearn.utils.multiclass import type_of_target
from collections import namedtuple
def train_nb(X, y):
    """Train a naive Bayes classifier on the watermelon dataset.

    Parameters
    ----------
    X : pd.DataFrame of shape (m, n) — feature matrix; columns may be
        discrete (strings) or continuous (floats).
    y : pd.Series of length m — labels, '是' (positive) / '否' (negative).

    Returns
    -------
    p1 : float — Laplace-smoothed prior P(y='是').
    p1_list, p0_list : lists of per-feature namedtuples
        (is_continuous, conditional_pro); for continuous features
        conditional_pro is [mean, variance], for discrete features it is a
        Series of *log* conditional probabilities indexed by feature value.
    """
    m, n = X.shape  # m = number of samples, n = number of features
    # Laplace-smoothed class prior; two classes -> denominator m + 2
    p1 = (len(y[y == '是']) + 1) / (m + 2)
    p1_list = []  # conditional distributions of each feature given y='是'
    p0_list = []  # conditional distributions of each feature given y='否'
    X1 = X[y == '是']  # rows of the positive class
    X0 = X[y == '否']  # rows of the negative class
    m1, _ = X1.shape  # number of positive samples
    m0, _ = X0.shape  # number of negative samples
    for i in range(n):  # iterate over the n features
        xi = X.iloc[:, i]  # full column of feature i (length m)
        # namedtuple named after the feature column, holding
        # (is_continuous flag, conditional distribution)
        p_xi = namedtuple(X.columns[i], ['is_continuous', 'conditional_pro'])
        is_continuous = type_of_target(xi) == 'continuous'
        xi1 = X1.iloc[:, i]  # feature i restricted to the positive class
        xi0 = X0.iloc[:, i]  # feature i restricted to the negative class
        if is_continuous:
            # Continuous feature: store [mean, variance] for the Gaussian
            # density evaluated at prediction time.
            p1_list.append(p_xi(is_continuous, [np.mean(xi1), np.var(xi1)]))
            p0_list.append(p_xi(is_continuous, [np.mean(xi0), np.var(xi0)]))
        else:
            # Discrete feature: Laplace-smoothed log conditional probabilities.
            unique_value = xi.unique()  # all values the feature takes
            nvalue = len(unique_value)  # number of distinct values
            # Use reindex (not label-list indexing): values absent within a
            # class become NaN -> 0 instead of raising KeyError on modern
            # pandas; +1 is the Laplace-smoothing count.
            xi1_value_count = pd.Series(xi1).value_counts().reindex(unique_value).fillna(0) + 1
            xi0_value_count = pd.Series(xi0).value_counts().reindex(unique_value).fillna(0) + 1
            p1_list.append(p_xi(is_continuous, np.log(xi1_value_count / (m1 + nvalue))))
            p0_list.append(p_xi(is_continuous, np.log(xi0_value_count / (m0 + nvalue))))
    return p1, p1_list, p0_list
def predict_nb(x, p1, p1_list, p0_list):
    """Classify one sample with the trained naive Bayes model.

    Parameters
    ----------
    x : sequence of length n — one sample's feature values.
    p1 : float — prior P(y='是') from train_nb.
    p1_list, p0_list : per-feature namedtuples from train_nb
        (is_continuous, conditional_pro).

    Returns
    -------
    '是' if the positive-class log posterior is larger, else '否'.
    """
    n = len(x)
    x_p1 = np.log(p1)       # accumulate log P(y='是') + sum of log-likelihoods
    x_p0 = np.log(1 - p1)   # accumulate log P(y='否') + sum of log-likelihoods
    for i in range(n):
        p1_xi = p1_list[i]
        p0_xi = p0_list[i]
        if p1_xi.is_continuous:
            # conditional_pro stores [mean, VARIANCE] (np.var in train_nb).
            mean1, var1 = p1_xi.conditional_pro
            mean0, var0 = p0_xi.conditional_pro
            # Gaussian log-density with variance v:
            #   log N(x; mu, v) = -0.5*log(2*pi*v) - (x-mu)^2 / (2*v)
            # (The previous code treated the variance as a standard
            # deviation, which skews both the normalizer and the exponent.)
            x_p1 += -0.5 * np.log(2 * np.pi * var1) - (x[i] - mean1) ** 2 / (2 * var1)
            x_p0 += -0.5 * np.log(2 * np.pi * var0) - (x[i] - mean0) ** 2 / (2 * var0)
        else:
            # Discrete feature: stored values are already log probabilities.
            x_p1 += p1_xi.conditional_pro[x[i]]
            x_p0 += p0_xi.conditional_pro[x[i]]
    return '是' if x_p1 > x_p0 else '否'
if __name__ == '__main__':
    # Load the watermelon 3.0 dataset (first CSV column is the row index).
    csv_path = r'D:\MachineLearning\pro1\watermelon3_0_Ch.csv'
    df = pd.read_csv(csv_path, index_col=0)
    features, labels = df.iloc[:, :-1], df.iloc[:, -1]
    # Fit the naive Bayes model, then classify the first training sample
    # as a sanity check.
    prior_pos, pos_cond, neg_cond = train_nb(features, labels)
    sample = features.iloc[0, :]
    print(predict_nb(sample, prior_pos, pos_cond, neg_cond))
因篇幅问题不能全部显示,请点此查看更多更全内容
Copyright © 2019- bangwoyixia.com 版权所有 湘ICP备2023022004号-2
违法及侵权请联系:TEL:199 1889 7713 E-MAIL:2724546146@qq.com
本站由北京市万商天勤律师事务所王兴未律师提供法律服务