import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
# Load the dataset
df = pd.read_csv("spam.csv", encoding="ISO-8859-1")
# Drop the empty trailing columns when they exist. errors="ignore" keeps this
# from raising KeyError on a CSV that only has the "v1"/"v2" columns.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
df = df.rename(columns={"v1": "label", "v2": "text"})
# Encode the label as an integer: 1 for "spam", 0 for everything else.
df["label"] = np.where(df["label"] == "spam", 1, 0)
# Preprocess the text data
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be installed
# (e.g. via nltk.download("stopwords")); it raises LookupError otherwise.
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    """Normalize a raw message into a space-separated string of kept tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and tokenized with ``nltk.word_tokenize``, and
    tokens present in the module-level ``stop_words`` set are discarded.

    Args:
        text: The raw message string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Strip non-letter characters first, then normalize the case.
    letters_only = re.sub("[^a-zA-Z]", " ", text).lower()
    kept_tokens = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue  # skip stop words
        kept_tokens.append(token)
    return " ".join(kept_tokens)
# Clean every message with preprocess_text.
df["text"] = df["text"].apply(preprocess_text)

# Split the data: 80% training, 20% test, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Feature extraction: raw token counts, then TF-IDF weighting.
# Both transformers are fitted on the training split only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train the model (fit() returns the fitted estimator itself).
clf = LinearSVC().fit(X_train_tfidf, y_train)

# Evaluate the model: reuse the fitted transformers on the test split.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
y_pred = clf.predict(X_test_tfidf)

print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification report:", classification_report(y_test, y_pred), sep="\n")
################## Below is part of the data
v1,v2 |
ham,Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... |
ham,Ok lar... Joking wif u oni... |
spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham,U dun say so early hor... U c already then say... |
ham,Nah I don't think he goes to usf, he lives around here though |
spam,XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> |
ham,Even my brother is not like to speak with me. They treat me like aids patent. |
ham,I HAVE A DATE ON SUNDAY WITH WILL!! |
spam,SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info
spam,URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C LCCLTD POBOX 4403LDNW1A7RW18
ham,I prefer eating at the bar, so that I can watch my calories. |