# [python] code library
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Translation table that turns every ASCII punctuation character into a
# space. Replacing (rather than deleting) keeps "hello,world" as two tokens
# and splits contractions such as "don't" into "don" + "t".
_PUNCT_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)

# Filled on first use by _english_stopwords(); None means "not loaded yet".
_english_stopwords_cache = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loaded only once."""
    global _english_stopwords_cache
    if _english_stopwords_cache is None:
        _english_stopwords_cache = frozenset(stopwords.words('english'))
    return _english_stopwords_cache


def clean_text(text):
    """Normalize raw text into a space-separated string of content words.

    Steps, in order: strip HTML tags, strip URLs, strip digits, turn ASCII
    punctuation into spaces, lowercase, tokenize with NLTK, and drop
    English stopwords.

    Args:
        text: The raw input string (may contain HTML markup and URLs).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing survives the filtering.

    Note:
        ``nltk.word_tokenize`` needs NLTK's Punkt tokenizer data to be
        installed in addition to the ``stopwords`` corpus.
    """
    # Tags become a space so "<p>a</p><p>b</p>" yields "a b", not "ab".
    text = re.sub(r'<.*?>', ' ', text)
    # URLs are removed before punctuation handling, while they are still
    # one whitespace-delimited run that the pattern can match whole.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+', '', text)
    # Punctuation -> space. NLTK's English stopword list contains the
    # fragments this produces ("don", "t", "s", "ll", ...), so contractions
    # are filtered out instead of surviving as "dont", "its", etc.
    text = text.translate(_PUNCT_TO_SPACE)
    text = text.lower()

    stopwords_set = _english_stopwords()
    words = nltk.word_tokenize(text)
    return ' '.join(w for w in words if w not in stopwords_set)