英文文本预处理
2019-06-12 本文已影响0人
_龙雀
import re

import nltk
# Fetch the NLTK "stopwords" corpus, which text_to_list loads through
# nltk.corpus.stopwords. This runs at import time of this module.
nltk.download('stopwords')
# Ordered (pattern, replacement) rewrites applied to the lower-cased text.
# Order matters: specific contractions ("what's", "can't") must be expanded
# before the generic "'s" / "n't" rules that would otherwise consume them.
_TEXT_SUBSTITUTIONS = (
    # Replace every character outside the allowed set with a space.
    # NOTE: "+-=" inside the class is a character RANGE (0x2B-0x3D), not three
    # literals, so besides "+", "-" and "=" it also keeps ":", ";" and "<".
    # The ":" rule further down relies on the colon surviving this step.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    # Pad "!" and ":" with spaces so they become standalone tokens.
    (r"!", " ! "),
    (r":", " : "),
    (r"e - mail", "email"),
)


def text_to_list(text, *, stop_words=None):
    """Normalize English text and return its tokens without stop words.

    The input is converted with ``str()``, lower-cased, stripped of
    characters outside a small allowed set, has common English contractions
    expanded ("can't" -> "cannot", "'ll" -> " will", ...), and is then split
    on whitespace. Tokens found in the stop-word collection are dropped.

    Args:
        text: The value to process. Any object is accepted; it is converted
            with ``str()`` first.
        stop_words: Optional iterable of lower-case tokens to remove. When
            ``None`` (the default), the NLTK English stop-word list is used,
            which requires the NLTK ``stopwords`` corpus to be available.

    Returns:
        A list of the remaining tokens, in their original order.

    Raises:
        LookupError: Raised by NLTK if ``stop_words`` is ``None`` and the
            ``stopwords`` corpus has not been downloaded.
    """
    cleaned = str(text).lower()
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Remove stop words.
    if stop_words is None:
        # Imported lazily so callers that pass their own stop words do not
        # need the NLTK corpus at all.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')

    # A frozenset gives O(1) membership tests instead of scanning a list
    # once per token.
    stop_set = frozenset(stop_words)
    return [token for token in cleaned.split() if token not in stop_set]