Python 运维大数据 爬虫Python AI SqlPython语言与信息数据获取和机器学习

自动摘要实现

2017-12-03  本文已影响48人  羽恒

知识点普及

样例代码

# -*- coding:utf-8 -*-
import re
import os
import jieba
import codecs
import numpy 
import pandas
import os.path
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

# Build the corpus: the whole input file is read as one UTF-8 document and
# stored as the single row of a one-column ('content') DataFrame.
with codecs.open('../data/input/war_cp.txt', mode='r', encoding='utf-8') as corpusFile:
    contents = [corpusFile.read()]

corpos = pandas.DataFrame({'content': contents})

# Load the stop-word list as a tab-separated table. quoting=3 is the value of
# csv.QUOTE_NONE, so quote characters in the file are kept as literal text.
# The vectorizer setup below reads the 'stopword' column of this frame.
stopwords = pandas.read_csv(
    "../data/input/StopwordsCN.txt",
    sep="\t",
    quoting=3,
    index_col=False,
    encoding='utf8',
)

# Bag-of-words vectorizer used to extract keywords with stop words removed.
# - token_pattern keeps every run of word characters between word boundaries,
#   so single-character tokens are retained.
# - min_df=1 keeps every term that occurs in at least one document. The
#   original integer 0 selected the same vocabulary, but recent scikit-learn
#   releases reject integer values below 1 during parameter validation.
countVectorizer = CountVectorizer(
    stop_words=list(stopwords['stopword'].values),
    min_df=1,
    token_pattern=r"\b\w+\b",
)
contents = []
summarys = []
filePaths = []

# Sentence terminators: ideographic full stop, ASCII and full-width question
# and exclamation marks, and newlines, together with any trailing whitespace.
# The full-width marks are written as escapes so they survive re-encoding.
sentenceSplitter = re.compile(r'[。?!\uff1f\uff01\n]\s*')
# A sentence is only a summary candidate when its space-joined token string
# is longer than this many characters.
minSegmentLength = 10

# For each document, pick as its summary the sentence whose term-count vector
# is closest (cosine distance) to the vector of the whole document.
for index, row in corpos.iterrows():
    fileContent = row['content']

    # The whole document is the reference every sentence is compared against.
    documentSegment = " ".join(jieba.cut(fileContent))

    # Candidate sentences and their tokenised forms, kept index-aligned.
    sentences = []
    sentenceSegments = []
    for sentence in sentenceSplitter.split(fileContent):
        segment = " ".join(jieba.cut(sentence))
        if len(segment.strip()) > minSegmentLength:
            sentences.append(sentence)
            sentenceSegments.append(segment)

    if not sentenceSegments:
        # No sentence is long enough to rank; fall back to the document text
        # itself instead of failing on an empty feature matrix.
        summary = fileContent.strip()
    else:
        # Row 0 of the feature matrix is the document, rows 1.. the sentences.
        textVector = countVectorizer.fit_transform(
            [documentSegment] + sentenceSegments
        )
        # Only the document-to-sentence distances are needed, so the document
        # row is compared against the sentence rows rather than building the
        # full all-pairs matrix. The document is excluded from the candidates
        # explicitly, so it can never be selected as its own summary.
        distances = pairwise_distances(
            textVector[0],
            textVector[1:],
            metric="cosine"
        )
        summary = sentences[int(numpy.argmin(distances[0]))]

    summarys.append(summary)
    contents.append(fileContent)

summaryDF = pandas.DataFrame({
    'content': contents,
    'summary': summarys
})
print(summaryDF)


我是阿羽，一枚正在学习的搬砖小工，希望大家多多指教！

上一篇 下一篇

猜你喜欢

热点阅读