This is based on Joel Grus: Data Science from Scratch
Chapter 13

In [62]:
from typing import Set
import re
from typing import NamedTuple,Tuple
import math
from collections import defaultdict
from sklearn.metrics import ConfusionMatrixDisplay

In [63]:
def tokenize(text: str) -> Set[str]:
    text = text.lower()                         # Convert to lowercase,
    all_words = re.findall("[a-z0-9']+", text)  # extract the words, and
    return set(all_words)                       # remove duplicates.

tokenize("Data Science is science")

{'data', 'is', 'science'}

In [64]:
class Message(NamedTuple):
    text: str
    is_spam: bool


In [65]:
class NaiveBayesClassifier:
    def __init__(self, k: float = 0.5) -> None:
        self.k = k  # smoothing factor

        self.tokens: Set[str] = set()
        self.token_spam_counts: Dict[str, int] = defaultdict(int)
        self.token_ham_counts: Dict[str, int] = defaultdict(int)
        self.spam_messages = self.ham_messages = 0

    def train(self, messages) -> None:
        for message in messages:
            # Increment message counts
            if message.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1

            # Increment word counts
            for token in tokenize(message.text):
                self.tokens.add(token)
                if message.is_spam:
                    self.token_spam_counts[token] += 1
                else:
                    self.token_ham_counts[token] += 1

    def _probabilities(self, token: str) -> Tuple[float, float]:
        """returns P(token | spam) and P(token | not spam)"""
        spam = self.token_spam_counts[token]
        ham = self.token_ham_counts[token]

        p_token_spam = (spam + self.k) / (self.spam_messages + 2 * self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2 * self.k)

        return p_token_spam, p_token_ham

    def predict(self, text: str) -> float:
        text_tokens = tokenize(text)
        # let's say a prior probability of a spam is 0.5
        log_prob_if_spam = log_prob_if_ham = math.log(0.5)

        # Iterate through each word in our vocabulary.
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._probabilities(token)

            # If *token* appears in the message,
            # add the log probability of seeing it;
            if token in text_tokens:
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)

            # otherwise add the log probability of _not_ seeing it
            # which is log(1 - probability of seeing it)
            else:
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)

        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_ham = math.exp(log_prob_if_ham)
        return prob_if_spam / (prob_if_spam + prob_if_ham)


In [66]:
messages = [Message("spam rules", is_spam=True),
            Message("ham rules", is_spam=False),
            Message("hello ham", is_spam=False)]

model = NaiveBayesClassifier(k=0.5)
model.train(messages)

In [67]:
model.predict("hello spam")

0.8350515463917526

In [68]:
model.predict("ham is the best food")

0.06323185011709603

In [70]:
import urllib

spamfile="http://www.iam.fmph.uniba.sk/ospm/Rosa/PDV/sacorpus-spam.txt"
hamfile="http://www.iam.fmph.uniba.sk/ospm/Rosa/PDV/sacorpus-ham.txt"

spamtext = urllib.request.urlopen(spamfile).read()
hamtext = urllib.request.urlopen(hamfile).read()



In [71]:
data = []

for line in hamtext.splitlines():
  data.append(Message(line.decode("utf-8"),False))

for line in spamtext.splitlines():
  data.append(Message(line.decode("utf-8"),True))

In [72]:
import random

random.seed(0)
random.shuffle(data)
cut = int(len(data) * 0.75)
train_messages = data[:cut]
test_messages = data[cut:]

train_messages


[Message(text='Subject: OO Programming Newsletter #41 from Bruce Eckel', is_spam=False),
 Message(text="Subject: [SAdev] [Bug 804] Razor debugging isn't functioning", is_spam=False),
 Message(text='Subject: Re: bad DCC traffic from e-corp.net', is_spam=False),
 Message(text="Subject: Living Love - Another legacy of the 60's", is_spam=False),
 Message(text='Subject: Re: DataPower announces XML-in-silicon', is_spam=False),
 Message(text='Subject: Re: ActiveBuddy', is_spam=False),
 Message(text='Subject: The MIME information you requested (last changed 3154 Feb 14)', is_spam=False),
 Message(text='Subject: Re: Working My_Mark2CurSeen', is_spam=False),
 Message(text="Subject: Steven Levy's wireless neighbors", is_spam=False),
 Message(text='Subject: [Spambayes] understanding high false negative rate', is_spam=False),
 Message(text='Subject: Re: flavor cystals', is_spam=False),
 Message(text='Subject: Blue people of the world unite!', is_spam=False),
 Message(text='Subject: Re: bad focus/cl

In [73]:
model = NaiveBayesClassifier()
model.train(train_messages)
predictions = [(message, model.predict(message.text)) for message in test_messages]

predictions[:5]

[(Message(text='Subject: Order your Viagra and weight-loss here 6117kFvc5--9', is_spam=True),
  0.9999999543893201),
 (Message(text='Subject: Re: How unlucky can you get?', is_spam=False),
  0.07118010873357666),
 (Message(text='Subject: [Spambayes] test sets?', is_spam=False),
  2.9399383491262537e-06),
 (Message(text='Subject: [Razor-users] Exit.status =13 ?', is_spam=False),
  7.013532601189357e-05),
 (Message(text='Subject: Dear Steve Jobs: At Macworld, show me this... [ANCHORDESK]', is_spam=False),
  0.23669589605852262)]

In [74]:
from collections import Counter
confusion_matrix = Counter((message.is_spam, spam_probability>0.5) for message, spam_probability in predictions)
print(confusion_matrix)
print((confusion_matrix[(False,False)]+confusion_matrix[(True,True)]) / len(predictions))

Counter({(False, False): 709, (True, True): 90, (True, False): 35, (False, True): 18})
0.937793427230047


In [75]:
def p_spam_given_token(token: str, model: NaiveBayesClassifier) -> float:
  # We probably shouldn't call private methods, but it's for a good cause.
  prob_if_spam, prob_if_ham = model._probabilities(token)
  return prob_if_spam / (prob_if_spam + prob_if_ham)

words = sorted(model.tokens, key=lambda t: p_spam_given_token(t, model))

print("spammiest_words", words[-10:])
print("hammiest_words", words[:10])

spammiest_words ['assistance', 'mortgage', 'guaranteed', 'clearance', 'norton', 'systemworks', 'sale', 'rates', 'money', 'adv']
hammiest_words ['spambayes', 'users', 'razor', 'zzzzteana', 'sadev', 'ouch', 'apt', 'perl', 'selling', 'bliss']


For most of this, you would use a library in practice: see https://scikit-learn.org/stable/modules/naive_bayes.html