[python]代码库
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
# Load the dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv("spam.csv", encoding="ISO-8859-1")
# Discard the unnamed filler columns when they exist. errors="ignore" keeps
# the script working on a copy of the file that only has the v1/v2 columns
# instead of failing with a KeyError.
df = df.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)
df = df.rename(columns={"v1": "label", "v2": "text"})
# Encode the target as integers: 1 for "spam", 0 for every other label.
df["label"] = np.where(df["label"] == "spam", 1, 0)

# Preprocess the text data.
# English stop words, stored in a set for fast membership tests.
# NOTE: the NLTK "stopwords" corpus must be installed beforehand
# (e.g. via nltk.download("stopwords")).
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    """Normalize a raw message into a space-separated string of tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and tokenized with ``nltk.word_tokenize``, and
    tokens present in the module-level ``stop_words`` set are dropped.

    Args:
        text: The raw message text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Replace non-letter characters (digits, punctuation, symbols) with spaces.
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    # Lower-case before tokenizing so the stop-word comparison matches.
    tokens = nltk.word_tokenize(letters_only.lower())
    # Drop stop words and rebuild a single string for the vectorizer.
    return " ".join(token for token in tokens if token not in stop_words)
# Clean every message with the preprocessing routine.
df["text"] = df["text"].apply(preprocess_text)

# Split the dataset: 20% is held out for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Feature extraction: raw token counts first, then TF-IDF weighting.
# Both transformers are fitted on the training split only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Push the test split through the already-fitted transformers.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Train the model; fit() returns the fitted estimator.
clf = LinearSVC().fit(X_train_tfidf, y_train)

# Evaluate the model on the held-out split.
y_pred = clf.predict(X_test_tfidf)
print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification report:", classification_report(y_test, y_pred), sep="\n")
################## Below is an excerpt of the dataset (spam.csv) — data, not code
v1,v2
ham,Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham,Ok lar... Joking wif u oni...
spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham,U dun say so early hor... U c already then say...
ham,Nah I don't think he goes to usf, he lives around here though
spam,XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>>
ham,Even my brother is not like to speak with me. They treat me like aids patent.
ham,I HAVE A DATE ON SUNDAY WITH WILL!!
spam,SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info
spam,URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C LCCLTD POBOX 4403LDNW1A7RW18
ham,I prefer eating at the bar, so that I can watch my calories.