Gaussian Naive Bayes

Naive Bayes with continuous features assumed to follow a Gaussian distribution. I found that my own implementation was about 20% less accurate than sklearn's... so I looked into where the gap came from.

Naive Bayes

Naive Bayes is a classifier based on Bayes' theorem and the assumption of conditional independence between features.

Principle

Naive Bayes learns the joint probability distribution $P(X, Y)$ from a given training set

$$T = \{(x_1, y_1), (x_2, y_2), \dots, (x_N, y_N)\}$$

Concretely, it learns the prior probability distribution

$$P(Y = c_k), \quad k = 1, 2, \dots, K$$

and the conditional probability distribution

$$P(X = x \mid Y = c_k) = P(X^{(1)} = x^{(1)}, \dots, X^{(n)} = x^{(n)} \mid Y = c_k), \quad k = 1, 2, \dots, K$$

which together determine the joint distribution $P(X, Y)$.

By the assumption that the features are mutually independent, we get

$$P(X = x \mid Y = c_k) = \prod_{j=1}^{n} P(X^{(j)} = x^{(j)} \mid Y = c_k)$$

At classification time, for a given input $x$, the model computes $P(Y = c_k \mid X = x)$ and outputs the class with the largest posterior probability. The posterior is computed as

$$P(Y = c_k \mid X = x) = \frac{P(Y = c_k) \prod_{j} P(X^{(j)} = x^{(j)} \mid Y = c_k)}{\sum_{k} P(Y = c_k) \prod_{j} P(X^{(j)} = x^{(j)} \mid Y = c_k)}$$

Since the denominator is the same for every $c_k$, the naive Bayes classifier can be written as

$$y = \arg\max_{c_k} P(Y = c_k) \prod_{j=1}^{n} P(X^{(j)} = x^{(j)} \mid Y = c_k)$$

Parameter Estimation

  1. If a feature is a discrete random variable, its probabilities can be estimated by frequencies.

    Let the set of possible values of the $j$-th feature be $\{a_{j1}, a_{j2}, \dots, a_{js_j}\}$. Then

    $$P(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\ y_i = c_k)}{\sum_{i=1}^{N} I(y_i = c_k)}, \quad l = 1, 2, \dots, s_j$$

  2. If a feature is a continuous random variable, the conditional probability can be estimated by assuming a normal distribution (see the sketch after this list):

    $$P(X^{(j)} = x^{(j)} \mid Y = c_k) = \frac{1}{\sqrt{2\pi\sigma^2_{c_k,j}}} \exp\left(-\frac{(x^{(j)} - \mu_{c_k,j})^2}{2\sigma^2_{c_k,j}}\right)$$

    Here $\mu_{c_k,j}$ and $\sigma^2_{c_k,j}$ are the mean and variance of the $j$-th feature when $Y = c_k$.
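The per-class means and variances are all the Gaussian model needs at training time. Below is a minimal NumPy sketch with made-up toy data (the arrays X, y and the query point x are purely illustrative) showing how $\mu_{c_k,j}$, $\sigma^2_{c_k,j}$ and the resulting conditional densities could be computed:

# Toy sketch of per-class Gaussian parameter estimation (all numbers are made up)
import numpy as np

X = np.array([[1.0, 2.0],
              [1.2, 1.8],
              [3.0, 3.5],
              [2.8, 3.7]])          # 4 samples, 2 continuous features
y = np.array([0, 0, 1, 1])          # class labels
x = np.array([1.1, 1.9])            # a new sample to score

for c in np.unique(y):
    Xc = X[y == c]
    mu = Xc.mean(axis=0)            # mu_{c,j}
    var = Xc.var(axis=0)            # sigma^2_{c,j}
    # conditional density of each feature of x under class c
    density = np.exp(-(x - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
    print(c, mu, var, density)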

Code

Since binary classification works the same way as $n$-class classification, the code below implements only the $n$-class naive Bayes classifier.
Repository: https://github.com/Davidham3/naive_bayes

# -*- coding:utf-8 -*-
import re
import numpy as np

def readDataSet(filename, frequency = 0, training_set_ratio = 0.7, shuffle = True):
    '''
    read the dataset file, shuffle it, and remove all punctuation

    Parameters
    ----------
    filename: str, the filename of the data

    frequency: int, you will select the words that appeared more than the frequency you specified
               for example, if you set frequency equals 1, the program will return all words that
               have appeared more than once.

    training_set_ratio: float, the ratio of training data account for in all data

    shuffle: bool, whether to shuffle the data

    Returns
    ----------
    train_text: list, each element contains a list of words in a sentence

    train_labels: list, each element is the label of the corresponding sentence

    test_text: list

    test_labels: list
    '''
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read().strip().split('\n')

    if shuffle:
        np.random.shuffle(text)

    # split all words by whitespace and add them with their labels to the list "dataset"
    dataset = []
    for index, i in enumerate(text):
        t = i.split('\t')
        label = t[0]
        t1 = re.sub(r"[\.\!\/_,$%^*(+\"\']+|[+——!,。??、~@#¥%……&*()]+", "", t[1])
        dataset.append((label, re.split(r'\s+', t1)))

    print("dataset's size is", len(dataset))

    # split labels and words
    labels, text = zip(*dataset)

    split_line = int(len(text) * training_set_ratio)
    train_text = text[:split_line]
    train_labels = labels[:split_line]
    test_text = text[split_line:]
    test_labels = labels[split_line:]
    return train_text, train_labels, test_text, test_labels

def preprocessing_training_data(text, labels):
    '''
    use bag of words to build features for training data

    Parameters
    ----------
    text: list, each element contains a list of words in a sentence

    labels: list, each element is the label of the sample corresponding to the element in text

    Returns
    ----------
    trainX: ndarray, training data, the shape of it is (number of samples, number of features)

    trainY: ndarray, labels of training data, the shape of it is (number of samples, )

    words_table: dict, key is a word, value is its index in the bag of words

    labels_table: dict, key is a label, value is the index that represents the corresponding label
    '''
    bag_of_words = tuple(set(word for words in text for word in words))
    words_table = {i: index for index, i in enumerate(bag_of_words)}
    # use zeros (not empty) so that the word counts start from 0
    trainX = np.zeros((len(text), len(bag_of_words)))
    for index, words in enumerate(text):
        for word in words:
            trainX[index, words_table[word]] += 1
    labels_table = {i: index for index, i in enumerate(set(labels))}
    trainY = np.array([labels_table[i] for i in labels])
    return trainX, trainY, words_table, labels_table

def preprocessing_testing_data(text, labels, words_table, labels_table):
    '''
    use bag of words to build features for testing data

    Parameters
    ----------
    text: list, each element contains a list of words in a sentence

    labels: list, each element is the label of the sample corresponding to the element in text

    words_table: dict, key is a word, value is its index in the bag of words

    labels_table: dict, key is a label, value is the index that represents the corresponding label

    Returns
    ----------
    testX: ndarray, testing data, the shape of it is (number of samples, number of features)

    testY: ndarray, labels of testing data, the shape of it is (number of samples, )
    '''
    # use zeros (not empty) so that the word counts start from 0
    testX = np.zeros((len(text), len(words_table)))
    for index, words in enumerate(text):
        for word in words:
            col = words_table.get(word)
            if col is not None:
                testX[index, col] += 1
    testY = []
    for i in labels:
        l = labels_table.get(i)
        if l is not None:
            testY.append(l)
        else:
            labels_table[i] = len(labels_table)
            testY.append(labels_table[i])
    testY = np.array(testY)
    return testX, testY

class GaussianNB:
    '''
    Gaussian naive bayes for continuous features
    '''
    def __init__(self):
        self.probability_of_y = {}
        self.mean = {}
        self.var = {}

    def fit(self, trainX, trainY):
        '''
        use trainX and trainY to compute the prior probability for each class
        and then compute the mean and variance of each feature for each class

        Parameters
        ----------
        trainX: ndarray, training data, the shape of it is (number of samples, number of features)

        trainY: ndarray, labels of training data, the shape of it is (number of samples, )
        '''
        labels = set(trainY.tolist())
        for y in labels:
            x = trainX[trainY == y, :]
            self.probability_of_y[y] = x.shape[0] / trainX.shape[0]
            self.mean[y] = x.mean(axis = 0)
            var = x.var(axis = 0)
            # avoid zero variance, which would make the Gaussian density blow up
            var[var == 0] += 1e-9 * var.max()
            self.var[y] = var

    def predict(self, testX):
        '''
        predict the labels of testX

        Parameters
        ----------
        testX: ndarray, testing data, the shape of it is (number of samples, number of features)

        Returns
        ----------
        ndarray: each element is the predicted label of the corresponding testing sample
        '''
        results = np.empty((testX.shape[0], len(self.probability_of_y)))
        labels = []
        for index, (label, py) in enumerate(self.probability_of_y.items()):
            # Gaussian density of every feature of every sample under this class
            t = np.exp(- ((testX - self.mean[label]) ** 2) / (2 * self.var[label])) / np.sqrt(2 * np.pi * self.var[label])
            # patch exact zeros before taking the log; this is where precision is lost
            t[t == 0] = np.finfo(np.longdouble).eps
            a = np.log(t)
            results[:, index] = np.exp(np.sum(a, axis = 1)) * py
            labels.append(label)
        return np.array(labels)[np.argmax(results, axis = 1)]

def accuracy(prediction, testY):
    '''
    compute accuracy for prediction

    Parameters
    ----------
    prediction: ndarray, the prediction generated by the classifier

    testY: ndarray, true labels

    Returns
    ----------
    float, accuracy
    '''
    return np.sum((prediction - testY) == 0) / testY.shape[0]

def main():
    datadir = 'SMSSpamCollection'
    train_text, train_labels, test_text, test_labels = readDataSet(datadir)
    trainX, trainY, words_table, labels_table = preprocessing_training_data(train_text, train_labels)
    print('training data shape:', trainX.shape, trainY.shape)
    testX, testY = preprocessing_testing_data(test_text, test_labels, words_table, labels_table)
    print('testing data shape:', testX.shape, testY.shape)
    a = GaussianNB()
    a.fit(trainX, trainY)
    r = a.predict(testX)
    print('accuracy:', accuracy(r, testY))

if __name__ == '__main__':
    main()

After finishing the implementation above and comparing it with scikit-learn's Gaussian Naive Bayes, I found the accuracy was about 20% lower...
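
For reference, the comparison can be reproduced with something like the following sketch, reusing trainX, trainY, testX and testY produced in main() above (those variable names come from the code in this post, not from sklearn):

# Compare with scikit-learn's implementation on the same bag-of-words features
from sklearn.naive_bayes import GaussianNB as SklearnGaussianNB

clf = SklearnGaussianNB()
clf.fit(trainX, trainY)
print('sklearn accuracy:', accuracy(clf.predict(testX), testY))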

So I looked at scikit-learn's implementation and found that sklearn implements a simplified version of the formula.

First, our objective is:

$$y = \arg\max_{c_k} P(Y = c_k) \prod_{j=1}^{n} P(X^{(j)} = x^{(j)} \mid Y = c_k)$$

Computing this directly can underflow: when many of the factors are very small, the long product runs into trouble.

To fix this, I originally thought I had to take the logarithm and then exponentiate again so the value stays unchanged, but in fact taking the logarithm alone is enough, because the logarithm does not change monotonicity.
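
A quick way to see both the problem and why the log alone suffices (the numbers here are arbitrary):

# Multiplying many small likelihoods underflows, but summing their logs does not
import numpy as np

p = np.full(2000, 1e-3)        # 2000 per-feature likelihoods, each fairly small
print(np.prod(p))              # 0.0 -- the product underflows float64
print(np.sum(np.log(p)))       # about -13815.5 -- still perfectly representable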

The objective then becomes:

$$y = \arg\max_{c_k} \left( \log P(Y = c_k) + \sum_{j=1}^{n} \log P(X^{(j)} = x^{(j)} \mid Y = c_k) \right)$$

The conditional probability is transformed in the same way:

$$\log P(X^{(j)} = x^{(j)} \mid Y = c_k) = -\frac{1}{2}\log\left(2\pi\sigma^2_{c_k,j}\right) - \frac{(x^{(j)} - \mu_{c_k,j})^2}{2\sigma^2_{c_k,j}}$$

scikit-learn implements this formula. Thinking about it, this form is much cleaner than the previous one: in the earlier formula, when the exponent of $e$ is very small the result becomes 0, and patching in an eps by hand distorts the result. On top of that, we also have to add an eps to the variance, so in the earlier formula we add eps to the variance and then possibly again after the exponentiation, and the error keeps accumulating.
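
A small made-up example of how the eps patch can flip a decision while the log-space scores keep the correct ordering:

# Two classes with very negative log-scores; class 1 should win
import numpy as np

log_scores = np.array([-805.0, -800.0])
direct = np.exp(log_scores)                         # both underflow to 0.0
direct[direct == 0] = np.finfo(np.longdouble).eps   # the manual patch used above
print(np.argmax(direct))       # 0 -- the tie is broken towards the wrong class
print(np.argmax(log_scores))   # 1 -- log-space keeps the right answer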

The improved model:

class myGaussianNB:
    '''
    Gaussian naive bayes for continuous features
    '''
    def __init__(self):
        self.label_mapping = dict()
        self.probability_of_y = {}
        self.mean = {}
        self.var = {}

    def _clear(self):
        self.label_mapping.clear()
        self.probability_of_y.clear()
        self.mean.clear()
        self.var.clear()

    def fit(self, trainX, trainY):
        '''
        use trainX and trainY to compute the prior probability for each class
        and then compute the mean and variance of each feature for each class

        Parameters
        ----------
        trainX: ndarray, training data, the shape of it is (number of samples, number of features)

        trainY: ndarray, labels of training data, the shape of it is (number of samples, )
        '''
        self._clear()
        labels = np.unique(trainY)
        self.label_mapping = {label: index for index, label in enumerate(labels)}
        for label in labels:
            x = trainX[trainY == label, :]
            self.probability_of_y[label] = x.shape[0] / trainX.shape[0]
            self.mean[label] = x.mean(axis = 0, keepdims = True)
            # smooth the variance with a small fraction of the largest feature
            # variance (similar to sklearn's var_smoothing)
            self.var[label] = x.var(axis = 0, keepdims = True) + 1e-9 * np.var(trainX, axis = 0).max()

    def predict(self, testX):
        '''
        predict the labels of testX

        Parameters
        ----------
        testX: ndarray, testing data, the shape of it is (number of samples, number of features)

        Returns
        ----------
        ndarray: each element is the predicted label of the corresponding testing sample
        '''
        results = np.empty((testX.shape[0], len(self.probability_of_y)))
        labels = [0] * len(self.probability_of_y)
        for label, index in self.label_mapping.items():
            py = self.probability_of_y[label]
            # log of the Gaussian likelihood, summed over features
            sum_of_conditional_probability = - 0.5 * np.sum(((testX - self.mean[label]) ** 2) / self.var[label], 1)
            sum_of_conditional_probability += - 0.5 * np.sum(np.log(2 * np.pi * self.var[label]))
            # log-posterior (up to a constant): log-likelihood + log-prior
            results[:, index] = sum_of_conditional_probability + np.log(py)
            labels[index] = label
        return np.array(labels)[np.argmax(results, axis = 1)]
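
To try the improved model, it can be dropped into the same pipeline as before; a minimal sketch:

# Hypothetical usage: swap myGaussianNB in for the original GaussianNB
def main():
    datadir = 'SMSSpamCollection'
    train_text, train_labels, test_text, test_labels = readDataSet(datadir)
    trainX, trainY, words_table, labels_table = preprocessing_training_data(train_text, train_labels)
    testX, testY = preprocessing_testing_data(test_text, test_labels, words_table, labels_table)
    model = myGaussianNB()
    model.fit(trainX, trainY)
    print('accuracy:', accuracy(model.predict(testX), testY))

if __name__ == '__main__':
    main()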