import re
import string

import nltk
from nltk.corpus import stopwords

# Download the NLTK data this module depends on.
# 'punkt' is required by nltk.word_tokenize (called in clean_text);
# without it, tokenization raises a LookupError at runtime.
nltk.download('stopwords')
nltk.download('punkt')
def clean_text(text):
    """Normalize raw English text for downstream NLP processing.

    Pipeline: strip HTML tags, URLs, and digits; remove punctuation;
    lowercase; tokenize; drop English stopwords.

    Args:
        text: The raw input string.

    Returns:
        A single space-joined string of the remaining lowercase tokens.
    """
    text = re.sub(r'<.*?>', '', text)    # remove HTML tags (non-greedy, so adjacent tags don't merge)
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'\d+', '', text)      # remove digit runs
    # Remove punctuation in one C-level pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    # Build the stopword set once so membership tests below are O(1).
    stopwords_set = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)  # requires the 'punkt' NLTK data package
    words = [w for w in words if w not in stopwords_set]
    return ' '.join(words)