Naive Bayes
Model assumption: Independence of features (No correlation between features), e.g. \(P(\text{Rain}\wedge\text{Hot})=P(\text{Rain})\cdot P(\text{Hot})\).
Events are described by a feature vector \(\mathbf{X}\) of length \(n\), where \(X_i\) is the value of the \(i\)-th feature. The outcome is described by a prediction variable \(Y\in\{0, 1\}\).
Estimations: \[P(X|Y=0) := \prod_{i=1}^n P(X_i | Y=0)\] \[P(X|Y=1) := \prod_{i=1}^n P(X_i | Y=1)\] \[P(Y=0 | X) := \frac{P(X|Y=0) \cdot P(Y=0)}{P(X)} \] \[P(Y=1 | X) := \frac{P(X|Y=1) \cdot P(Y=1)}{P(X)}\]
If \(P(Y = 0 | X) > P(Y = 1 | X)\), then predict \(Y = 0\); otherwise predict \(Y = 1\).
Laplace Smoothing
If a feature value never occurs together with a class in the training set, its estimated conditional probability is zero, which zeroes out the entire product \(\prod_i P(X_i|Y)\). To avoid this, we use Laplace smoothing, which adds a small constant \(\alpha = 1\) to the numerator and \(2\alpha\) to the denominator of each probability estimate (for a binary feature; in general \(K\alpha\) for a feature with \(K\) possible values).
Classification Example
from load_data import loadData
def train(training_mails, training_labels, alpha=1.0):
    """
    Train a Naive Bayes spam model on the training data.

    :param training_mails: A matrix where the rows are the mails and the columns are
        the words (entry [i, j] is the count of word j in mail i).
    :param training_labels: A vector where the i-th entry is the label of the i-th
        mail. 1 means spam, 0 means non-spam.
    :param alpha: Laplace-smoothing constant added to every word count so that words
        never seen in one class do not yield a zero probability (which would zero out
        the whole likelihood product at prediction time). ``alpha=0`` reproduces the
        unsmoothed maximum-likelihood estimate.
    :return: Dict with per-word token probabilities for each class and the class priors.
    """
    spam_mails = training_mails[training_labels == 1]
    non_spam_mails = training_mails[training_labels == 0]
    vocabulary_size = training_mails.shape[1]
    # P(word | class) with Laplace smoothing: (count + alpha) / (total + alpha * |V|).
    # (The notes state 2*alpha in the denominator for a binary feature; for word
    # counts over a vocabulary of |V| tokens the general form alpha * |V| applies.)
    spam_token_probabilities = (spam_mails.sum(axis=0) + alpha) / (spam_mails.sum() + alpha * vocabulary_size)
    non_spam_token_probabilities = (non_spam_mails.sum(axis=0) + alpha) / (non_spam_mails.sum() + alpha * vocabulary_size)
    # Class priors P(Y=1) and P(Y=0), estimated from label frequencies.
    spam_probability = spam_mails.shape[0] / training_mails.shape[0]
    non_spam_probability = non_spam_mails.shape[0] / training_mails.shape[0]
    return {
        "spam_token_probabilities": spam_token_probabilities,
        "non_spam_token_probabilities": non_spam_token_probabilities,
        "spam_probability": spam_probability,
        "non_spam_probability": non_spam_probability
    }
def calculate_probabilities(testing_mails, token_probabilities):
    """
    Compute the likelihood of each mail under one class's token model.

    Applies the Naive Bayes independence assumption: the mail likelihood is the
    product over the vocabulary of each word's class-conditional probability,
    raised to that word's count in the mail.

    :param testing_mails: A matrix where the rows are the mails and the columns are the words.
    :param token_probabilities: Per-word probabilities given the class (spam or non-spam).
    :return: Vector with P(mail | class) for every row of testing_mails.
    """
    per_token_likelihoods = token_probabilities ** testing_mails
    return per_token_likelihoods.prod(axis=1)
def test(testing_mails, testing_labels, model):
    """
    Evaluate the trained model on the testing data and print the accuracy.

    :param testing_mails: A matrix where the rows are the mails and the columns are the words.
    :param testing_labels: A vector where the i-th entry is the label of the i-th mail.
        1 means spam, 0 means non-spam.
    :param model: Dict returned by train() holding the token probabilities and class priors.
    :return: The accuracy of the model.
    """
    # Look the entries up by key rather than unpacking model.values(), which
    # silently depends on dict insertion order; also keep the priors and the
    # per-mail posteriors in separately named variables instead of shadowing.
    spam_posteriors = calculate_probabilities(testing_mails, model["spam_token_probabilities"]) * model["spam_probability"]
    non_spam_posteriors = calculate_probabilities(testing_mails, model["non_spam_token_probabilities"]) * model["non_spam_probability"]
    # The evidence P(X) is identical for both classes, so comparing the
    # unnormalized posteriors is sufficient for classification.
    predicted_labels = (spam_posteriors > non_spam_posteriors).astype(int)
    accuracy = (predicted_labels == testing_labels).sum() / len(testing_labels)
    print(f"Predicted labels: {predicted_labels}, actual labels: {testing_labels}")
    print("Accuracy: %.2f%% => Error-Rate: %.2f%%" % (accuracy * 100, (1 - accuracy) * 100))
    return accuracy
if __name__ == "__main__":
    # Train on the exercise-1 training split, then evaluate on its test split.
    data = loadData()
    trained_model = train(
        training_mails=data['trainMatrixEx1'],
        training_labels=data['trainLabelsEx1']
    )
    test(
        testing_mails=data['testMatrixEx1'],
        testing_labels=data['testLabelsEx1'],
        model=trained_model
    )