ctf-resources/thm/aoc23/day15/complete.py

#!/usr/bin/env python3
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Train a Multinomial Naive Bayes classifier on the labelled messages in
# 'emails_dataset.csv', report its performance on a held-out split, then
# classify one hard-coded sample message and every row of 'test_emails.csv'.

# Read the email dataset. read_csv already returns a DataFrame, so no
# additional pd.DataFrame(...) wrapping is needed.
df = pd.read_csv('emails_dataset.csv')

# Split the raw text *before* vectorizing so the held-out messages cannot
# influence the vocabulary (fitting the vectorizer on the full dataset leaks
# test-set information into the features and inflates the reported scores).
# A fixed random_state makes the split, and therefore the report, repeatable.
y = df['Classification']
text_train, text_test, y_train, y_test = train_test_split(
    df['Message'], y, test_size=0.2, random_state=42
)

# Convert text to token-count features. The vocabulary is learned from the
# training split only; the test split is merely projected onto it.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train the model
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the model's performance on the held-out 20% of the data
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Add a simple test to the model: classify one hand-written sample message.
# transform() (not fit_transform()) reuses the vocabulary learned above.
message = vectorizer.transform(["Today's Offer! Claim ur $150 worth of discount vouchers! Text YES to 85023 now! SavaMob, member offers mobile! T Cs 08717898035. $3.00 Sub. 16 . Unsbub reply X"])
prediction = clf.predict(message)
print("The email is: ", prediction[0])

# Run the complete test of the emails inside "test_emails.csv".
# NOTE: this file's text column is read as 'Messages' (plural), whereas the
# training file's column is read as 'Message'.
test_data = pd.read_csv("test_emails.csv")

X_new = vectorizer.transform(test_data['Messages'])
new_predictions = clf.predict(X_new)

# Pair every message with its predicted label for display.
results_df = pd.DataFrame({'Messages': test_data['Messages'], 'Prediction': new_predictions})
print(results_df)
Added initial code. 2024-07-03 21:06:15 +02:00			`#!/usr/bin/env python3`
			`import numpy as np`
			`import pandas as pd`
			`from sklearn.feature_extraction.text import CountVectorizer`
			`from sklearn.model_selection import train_test_split`
			`from sklearn.naive_bayes import MultinomialNB`
			`from sklearn.metrics import classification_report`

			`#Read the email dataset.`
			`data = pd.read_csv('emails_dataset.csv')`
			`df = pd.DataFrame(data)`

			`#Convert text to numbers.`
			`vectorizer = CountVectorizer()`
			`X = vectorizer.fit_transform(df['Message'])`

			`#Split the dataset for training and testing`
			`y = df['Classification']`
			`X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)`

			`#Train the model`
			`clf = MultinomialNB()`
			`clf.fit(X_train, y_train)`

			`#Evaluate the model's performance`
			`y_pred = clf.predict(X_test)`
			`print(classification_report(y_test, y_pred))`

			`#Add a simple test to the model`
			`message = vectorizer.transform(["Today's Offer! Claim ur $150 worth of discount vouchers! Text YES to 85023 now! SavaMob, member offers mobile! T Cs 08717898035. $3.00 Sub. 16 . Unsbub reply X"])`
			`prediction = clf.predict(message)`
			`print("The email is: ", prediction[0])`

			`#Run the complete test of the emails inside "test_emails.csv"`
			`test_data = pd.read_csv("test_emails.csv")`

			`X_new = vectorizer.transform(test_data['Messages'])`
			`new_predictions = clf.predict(X_new)`
			`results_df = pd.DataFrame({'Messages': test_data['Messages'], 'Prediction': new_predictions})`
			`print(results_df)`