基于Python3.6爬虫 采集知网文献
2018-11-24 本文已影响86人
python与数据分析
最近因公司需求,需要采集知网数据(标题、来源、关键字、作者、单位、分类号、摘要、相似文献这些字段)。由于知网反爬太强,内容页链接经过加密,尝试了pyspider、scrapy、selenium,都无法进入内容页,而是直接跳转到知网首页。于是只好改用知网的另一个接口(远见搜索)进行采集:http://yuanjian.cnki.com.cn/ 。两个网站关于“卷积神经网络”的期刊数据量对比如下图所示:
image.png
仔细观察会发现,该网站的搜索是POST请求,重点是请求时要带上表单参数。打开远见,搜索你想要的内容,按F12打开开发者工具,查看该请求参数里的表单数据(Form Data)。比如我要采集的主题是“卷积神经网络”,文章类型是期刊,把下面的参数替换成你自己的就OK了。
# Form fields sent in the body of the search POST request.
# NOTE: `i` is not defined in this snippet; in the implementation further
# down it is the loop variable holding the current result-page number.
# 'Theme' carries the search term; presumably ArticleType 1 selects journal
# articles (the surrounding text says the article type is "期刊") -- confirm
# against the form data shown in the browser's developer tools.
formdata = {'Type': 1,
'Order': 1,
'Islegal': 'false',
'ArticleType': 1,
'Theme': '卷积神经网络',
'searchType': 'MulityTermsSearch',
'ParamIsNullOrEmpty': 'true',
'Page': i}
下面是实现代码:
# encoding='utf-8'
import json
import re
from lxml import etree
import requests
import codecs
class CNKI(object):
    """Collect journal-article metadata through the CNKI "Yuanjian" search.

    Each result page is requested with a form POST; every article linked
    from the page is then fetched and reduced to a flat dict (see
    :meth:`content` for the keys).
    """

    # Endpoint that receives the search form.
    SEARCH_URL = 'http://yuanjian.cnki.net/Search/Result'

    # Seconds to wait on any single HTTP request. Without a timeout
    # ``requests`` waits forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Headers sent with every request (search and article pages).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

    # Raw ``Cookie`` header value as captured from a browser session.
    _RAW_COOKIE = 'Ecp_ClientId=4181108101501154830; cnkiUserKey=ec1ef785-3872-fac6-cad3-402229207945; UM_distinctid=166f12b44b1654-05e4c1a8d86edc-b79183d-1fa400-166f12b44b2ac8; KEYWORD=%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C%24%E5%8D%B7%E7%A7%AF%20%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C; Ecp_IpLoginFail=1811121.119.135.10; amid=73b0014b-8b61-4e24-a333-8774cb4dd8bd; SID=110105; CNZZDATA1257838113=579682214-1541655561-http%253A%252F%252Fsearch.cnki.net%252F%7C1542070177'

    # ``requests`` expects one dict entry per cookie. Passing the whole
    # header under a single 'Cookie' key would send one bogus cookie named
    # "Cookie", so the header is split into name/value pairs here.
    cookies = dict(pair.split('=', 1) for pair in _RAW_COOKIE.split('; '))

    # Extra HTTP headers for the search request. They used to be passed as
    # ``params=`` (i.e. appended to the URL as a query string) and their
    # names/values contained stray spaces; they are real headers and are now
    # sent as such.
    # NOTE(review): 'X-Requested-With: XMLHttpRequest' marks the request as
    # AJAX; confirm the listing XPaths still match the markup the server
    # returns for such requests.
    param = {
        'Accept': 'text/html, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'yuanjian.cnki.net',
        'Origin': 'http://yuanjian.cnki.net',
        'Referer': 'http://yuanjian.cnki.net/Search/Result',
        'X-Requested-With': 'XMLHttpRequest',
    }

    def content(self, *, theme='卷积神经网络', first_page=1, last_page=133):
        """Crawl the result pages and return one dict per article.

        Args:
            theme: Search term sent as the ``Theme`` form field.
            first_page: First result page to request (1-based).
            last_page: Last result page to request, inclusive.

        Returns:
            A list of dicts with the keys ``source``, ``title``,
            ``keywordsEn``, ``keywordsCh``, ``author``, ``unit``,
            ``classify``, ``abstractCh``, ``abstractEn`` and
            ``similar_article``.

        The crawl is best-effort: a result page or an article that fails to
        download or parse is reported with ``print`` and skipped.
        """
        li = []
        for page in range(first_page, last_page + 1):
            print('当前页', page)
            # Form fields of the search POST.
            formdata = {'Type': 1,
                        'ArticleType': 1,
                        'Theme': theme,
                        'Page': page}
            print(formdata)
            try:
                entries = self._parse_listing(self._fetch_listing(formdata))
            except Exception as e:
                # Best-effort: one bad result page must not abort the crawl.
                print(e)
                continue
            for link, source, keywords in entries:
                try:
                    li.append(self._fetch_article(link, source, keywords))
                except Exception as e:
                    # Best-effort: skip articles whose page cannot be
                    # downloaded or lacks an expected element.
                    print(e)
            print(len(li))
        return li

    def _fetch_listing(self, formdata):
        """POST the search form and return the parsed result page."""
        request_headers = dict(self.headers)
        request_headers.update(self.param)
        r = requests.post(self.SEARCH_URL, data=formdata,
                          headers=request_headers, cookies=self.cookies,
                          timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()
        # Let requests guess the charset from the body before decoding.
        r.encoding = r.apparent_encoding
        return etree.HTML(r.text)

    @staticmethod
    def _parse_listing(tree):
        """Return ``(link, source, keywords)`` for each result on a page.

        All three values are read relative to the same result element, so
        they cannot drift out of alignment when one result lacks a field.
        Results without a link are skipped; a missing source becomes ``''``.
        """
        entries = []
        for item in tree.xpath("//*[@id='article_result']/div/div"):
            links = item.xpath("./p[1]/a[1]/@href")
            if not links:
                continue
            source_texts = item.xpath("./p[3]/a[1]/span/text()")
            source = source_texts[0] if source_texts else ''
            keywords = ';'.join(item.xpath("./div[1]/p[1]/a/text()"))
            entries.append((links[0], source, keywords))
        return entries

    def _fetch_article(self, link, source, keywords):
        """Download one article page and return its metadata dict.

        Raises:
            IndexError: If the title, classification or abstract element is
                missing from the page.
            requests.RequestException: If the download fails.
        """
        print('当前链接:', link)
        response = requests.get(link, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        page = etree.HTML(response.text)

        # Article title.
        title = page.xpath("//h1[@class='xx_title']/text()")[0]
        print('标题:', title)
        # Source and keywords come from the listing page.
        print('来源:', source)
        print('关键字:', keywords)
        # Authors (kept as a list of names).
        author = page.xpath("//*[@id='content']/div[2]/div[3]/a/text()")
        print('作者:', author)
        # Affiliation.
        units = ''.join(
            page.xpath("//*[@id='content']/div[2]/div[5]/a[1]/text()")
        ).strip(';')
        print('单位:', units)
        # Classification number: last text node of the same container.
        classify = page.xpath("//*[@id='content']/div[2]/div[5]/text()")[-1]
        print('分类号:', classify)
        # Abstract: second text node of the first 'xx_font' div.
        abstract = page.xpath("//div[@class='xx_font'][1]/text()")[1].strip()
        print('摘要:', abstract)
        # Similar articles.
        # NOTE(review): this XPath contains 'tbody' steps; lxml does not add
        # <tbody> the way browsers do, so confirm the raw HTML has them.
        similar = page.xpath(
            "//*[@id='xiangsi']/table[2]/tbody/tr[3]/td/table/tbody/tr/td/text()")

        # Key order is kept stable because it determines the JSON output.
        return {
            'source': source,
            'title': title,
            'keywordsEn': '',
            'keywordsCh': keywords,
            'author': author,
            'unit': units,
            'classify': classify,
            'abstractCh': abstract,
            'abstractEn': '',
            'similar_article': self._split_similar(similar),
        }

    @staticmethod
    def _split_similar(text_nodes):
        """Split the similar-articles text into one citation per entry.

        The text nodes are concatenated, CRLF sequences removed, and the
        result cut after every '期'. Pieces of three characters or fewer
        (including the re-appended '期') are discarded as noise.
        """
        joined = ''.join(text_nodes).replace('\r\n', '')
        citations = (piece + '期' for piece in joined.split('期'))
        return [citation for citation in citations if len(citation) > 3]
if __name__ == '__main__':
    # Script entry point: crawl with the default settings, echo the result,
    # and append it to ./cnki_data.json.
    con = CNKI()
    items = con.content()
    print(items)
    try:
        # The `with` block closes the file on every exit path. The former
        # `finally: fp.close()` was redundant and raised NameError when
        # `codecs.open` itself failed, because `fp` was never bound.
        with codecs.open('./cnki_data.json', 'a+', encoding="utf-8") as fp:
            # One JSON object per line, each followed by a comma. The file
            # as a whole is therefore not a single valid JSON document.
            for record in items:
                fp.write(json.dumps(record, ensure_ascii=False) + ",\n")
    except IOError as err:
        print('error' + str(err))
完~
希望能帮助大家,小白一枚,如有不对请指正。