爬虫基础_03——xpath
2017-06-11 本文已影响139人
王小鱼鱻
今天利用xpath爬取的网址是:简书首页
包括:标题,作者,发表时间,内容,阅读量,评论数,点赞数,打赏数,所投专题
主要思想:利用xpath获取网页中的数据,然后存到本地的csv
下面了解一下xpath的用法
首先必须要导入 lxml 库
参考文章:《Python爬虫利器三之Xpath语法与lxml库的用法》
1、首先是爬的第一页的数据
运行代码:
# coding: utf-8
"""Scrape the first page of the Jianshu home feed and save it as a CSV file.

For every article the script records: title, author, publish time, abstract,
read count, comment count, like count, reward count and topic.
"""
import requests
from lxml import etree
import csv

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
header = {'User-Agent': user_agent}


def first_text(node, path, default='', index=0):
    """Return the stripped ``index``-th result of ``path`` under ``node``.

    ``path`` must select strings (``text()`` or an attribute). When the
    query yields fewer than ``index + 1`` results, ``default`` is returned
    instead of raising, so one incomplete article cannot abort the crawl.
    """
    values = node.xpath(path)
    if len(values) > index:
        return values[index].strip()
    return default


# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get('http://www.jianshu.com/', headers=header, timeout=10).content
selector = etree.HTML(html)
infos = selector.xpath('//div[@id="list-container"]/ul/li/div')

# One dict per article on the first page.
rows = []
for info in infos:
    rows.append({
        '文章标题': first_text(info, 'a/text()'),
        '作者': first_text(info, 'div[1]/div/a/text()'),
        '发表时间': first_text(info, 'div[1]/div/span/@data-shared-at'),
        '内容': first_text(info, 'p/text()'),
        # The second text node is used for the read/comment links, exactly
        # as in the original queries.
        # NOTE(review): a[2]/a[3] are positional; presumably they shift when
        # an article has no topic link - verify against the live markup.
        '阅读量': first_text(info, 'div[2]/a[2]/text()', '0', index=1),
        '评论数': first_text(info, 'div[2]/a[3]/text()', '0', index=1),
        # Must be a relative path: a query starting with '//' is evaluated
        # from the document root, which gave every row the first like count
        # found on the page.
        '点赞数': first_text(info, 'div[2]/span[1]/text()', '0'),
        '打赏数': first_text(info, 'div[2]/span[2]/text()', '0'),
        # Select the topic link by class, consistent with the multi-page
        # versions of this crawler.
        '主题': first_text(info, 'div[2]/a[@class="collection-tag"]/text()', '暂未收录专题'),
    })

# Store the scraped rows in a CSV file; the ``with`` block closes the file.
csv_name = ['文章标题', '作者', '发表时间', '内容', '阅读量', '评论数', '点赞数', '打赏数', '主题']
with open('jianshu_xpath.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_name)
    writer.writeheader()
    writer.writerows(rows)
运行结果:
(图:第一页的文章信息.png)
第一页比较容易,主要是每个数据爬取路径的选取,还有循环点的选取;
2、爬取简书首页前15页的数据
a、首先要分析一下每页的加载方式,通过点击更多,可以发现url并没有变化,所以是异步加载,下面要抓包分析一下后面每页请求的url有什么共同点。
上面每页的id都可以在上一页找到,而且是会累加的,
(图:Paste_Image.png、Paste_Image.png)
具体的分析可以参看liang的文章 http://www.jianshu.com/p/9afef50a8cc7 ,写得很详细,这里就不多说了;
运行代码:
# coding: utf-8
"""Scrape the first 15 pages of the Jianshu home feed into a CSV file.

The feed loads further pages asynchronously: each request carries the note
ids of every article already shown (``seen_snote_ids[]``) plus a page number.
"""
import requests
from lxml import etree
import csv

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# SECURITY NOTE(review): this is a hard-coded login session cookie. Replace it
# with your own cookie and avoid publishing real credentials in source code.
header = {'User-Agent': user_agent,
          'cookie': 'remember_user_token=W1szNjE3MDgyXSwiJDJhJDEwJDMuQTVNeHVYTkUubFQvc1ZPM0V5UGUiLCIxNDk3MTcyNDA2Ljk2ODQ2NjMiXQ%3D%3D--56522c2190961ce284b1fe108b267ae0cd5bf32a; _session_id=YVRyNm5tREZkK1JwUGFZVDNLdjJoL25zVS8yMjBnOGlwSnpITEE0U0drZHhxSU5XQVpYM2RpSmY5WU44WGJWeHVZV3d1Z1lINHR0aXhUQzR6Z1pMUW52RGI5UHpPRVFJRk5HeUcybEhwc21raVBqbk9uZmhjN0xQWmc2ZFMreXhGOHlhbmJiSDBHQUVsUTNmN2p0M2Y2TjgrWnBjVis4ODE4UXRhWmJ6K2VETHJlakhHbEl0djhDNDRKYVZEWndENjhrSGIvZ1crNC9NNnh4UmlpOVFPNWxGWm1PUmxhQk1sdnk2OXozQVZwU1hXVm9lMTU3WkUyUkhialZKZ2MvVkFOYk1tOUw3STkrMGNFWXVIaklDNlNpTmkrVi9iNDIrRzBDU0ZNNnc3b3I2bkhvLzFCSCsvTWdsUDExdEZBa0RsU3RqTURWcjdNU1VOTGVBeTk2MERMUXN1UlZqUytuYXdWdnI4cTkxTjFPbG5Ia3IzK3NXcVNpMENwWVZPSUV3TWU4TENaRWUva24ybXMzSE9MTVZRSEdrVDJhMzhzM05RUnBoMk8xU1FHYz0tLTFxUnlXWTZLQXM4dW9EQmVxMHZwRWc9PQ%3D%3D--6fb5c178053ee287201628ee5d7b2b61c170e994'}


def first_text(node, path, default='', index=0):
    """Return the stripped ``index``-th result of ``path`` under ``node``.

    ``path`` must select strings (``text()`` or an attribute). When the
    query yields fewer than ``index + 1`` results, ``default`` is returned
    instead of raising, so one incomplete article cannot abort the crawl.
    """
    values = node.xpath(path)
    if len(values) > index:
        return values[index].strip()
    return default


rows = []    # one dict per scraped article, across all pages
params = []  # 'seen_snote_ids[]=<id>' entries collected from earlier pages

for p in range(1, 16):
    # Join the seen ids and the page number in a single step so the first
    # request is '/?page=1' rather than the malformed '/?&page=1'.
    query = params + ['page={}'.format(p)]
    url = 'http://www.jianshu.com/?' + '&'.join(query)

    # A timeout keeps the crawl from hanging forever on a stalled connection.
    html = requests.get(url, headers=header, timeout=10).text
    selector = etree.HTML(html)

    # Remember every note id on this page; the next request must send them.
    for li_page in selector.xpath('//div[@id="list-container"]/ul/li'):
        note_ids = li_page.xpath('@data-note-id')
        if note_ids:
            params.append('seen_snote_ids[]=' + note_ids[0])

    for info in selector.xpath('//div[@id="list-container"]/ul/li/div'):
        rows.append({
            '文章标题': first_text(info, 'a/text()'),
            '作者': first_text(info, 'div[1]/div/a/text()'),
            '发表时间': first_text(info, 'div[1]/div/span/@data-shared-at'),
            '内容': first_text(info, 'p/text()'),
            # The second text node is used for the read/comment links,
            # exactly as in the original queries.
            '阅读量': first_text(info, 'div[2]/a[2]/text()', '0', index=1),
            '评论数': first_text(info, 'div[2]/a[3]/text()', '0', index=1),
            # Must be a relative path: a query starting with '//' is
            # evaluated from the document root, which gave every row the
            # first like count found on the page.
            '点赞数': first_text(info, 'div[2]/span[1]/text()', '0'),
            '打赏数': first_text(info, 'div[2]/span[2]/text()', '0'),
            '主题': first_text(info, 'div[2]/a[@class="collection-tag"]/text()', '暂未收录专题'),
        })

# Store the scraped rows in a CSV file.
csv_name = ['文章标题', '作者', '发表时间', '内容', '阅读量', '评论数', '点赞数', '打赏数', '主题']
with open('jianshu_xpath2.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_name)
    writer.writeheader()
    writer.writerows(rows)
运行结果:
(图:简书首页所有的文章信息.png)
注意:这里的cookie一定要在登录之后再获取,否则只能重复爬到第一页的数据;
最后再把整个代码整理封装一下,稍微好看一点:
#coding: utf-8
import requests
from lxml import etree
import csv
class Jianshu():
    """Crawler for the first 15 pages of the Jianshu home feed.

    Scraped articles accumulate in ``self.a`` (a list of dicts) and are
    written to ``jianshu_xpath2.csv``. ``self.params`` holds the
    ``seen_snote_ids`` query entries gathered from pages already fetched;
    they are sent with each later request.
    """

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    # SECURITY NOTE(review): this is a hard-coded login session cookie.
    # Replace it with your own cookie and avoid publishing real credentials.
    header = {'User-Agent': user_agent,
              'cookie': 'remember_user_token=W1szNjE3MDgyXSwiJDJhJDEwJDMuQTVNeHVYTkUubFQvc1ZPM0V5UGUiLCIxNDk3MTcyNDA2Ljk2ODQ2NjMiXQ%3D%3D--56522c2190961ce284b1fe108b267ae0cd5bf32a; _session_id=YVRyNm5tREZkK1JwUGFZVDNLdjJoL25zVS8yMjBnOGlwSnpITEE0U0drZHhxSU5XQVpYM2RpSmY5WU44WGJWeHVZV3d1Z1lINHR0aXhUQzR6Z1pMUW52RGI5UHpPRVFJRk5HeUcybEhwc21raVBqbk9uZmhjN0xQWmc2ZFMreXhGOHlhbmJiSDBHQUVsUTNmN2p0M2Y2TjgrWnBjVis4ODE4UXRhWmJ6K2VETHJlakhHbEl0djhDNDRKYVZEWndENjhrSGIvZ1crNC9NNnh4UmlpOVFPNWxGWm1PUmxhQk1sdnk2OXozQVZwU1hXVm9lMTU3WkUyUkhialZKZ2MvVkFOYk1tOUw3STkrMGNFWXVIaklDNlNpTmkrVi9iNDIrRzBDU0ZNNnc3b3I2bkhvLzFCSCsvTWdsUDExdEZBa0RsU3RqTURWcjdNU1VOTGVBeTk2MERMUXN1UlZqUytuYXdWdnI4cTkxTjFPbG5Ia3IzK3NXcVNpMENwWVZPSUV3TWU4TENaRWUva24ybXMzSE9MTVZRSEdrVDJhMzhzM05RUnBoMk8xU1FHYz0tLTFxUnlXWTZLQXM4dW9EQmVxMHZwRWc9PQ%3D%3D--6fb5c178053ee287201628ee5d7b2b61c170e994'}

    def __init__(self):
        # Per-instance state. As class attributes these lists were shared by
        # every Jianshu instance.
        self.a = []
        self.params = []

    def total_page(self):
        """Fetch pages 1-15 in order and write the CSV file once at the end.

        The CSV is written in a ``finally`` clause so that rows collected
        before a failed request are still saved.
        """
        try:
            for p in range(1, 16):
                self.get_data(self._page_url(p), save=False)
        finally:
            self._save_csv()

    def _page_url(self, page):
        """Return the feed URL for ``page``, including all seen note ids."""
        # Join ids and page number in one step so page 1 becomes '/?page=1'
        # rather than the malformed '/?&page=1'.
        query = self.params + ['page={}'.format(page)]
        return 'http://www.jianshu.com/?' + '&'.join(query)

    def get_data(self, url, save=True):
        """Download ``url``, record its note ids and append its articles.

        Args:
            url: Address of the feed page to fetch.
            save: When true (the default) the CSV file is rewritten after
                this page has been processed.
        """
        # A timeout keeps the crawl from hanging on a stalled connection.
        html = requests.get(url, headers=self.header, timeout=10).text
        selector = etree.HTML(html)

        # Remember every note id on this page for the next request.
        for li_page in selector.xpath('//*[@id="list-container"]/ul/li'):
            note_ids = li_page.xpath('@data-note-id')
            if note_ids:
                self.params.append('seen_snote_ids%5B%5D=' + note_ids[0])

        first_text = self._first_text
        for info in selector.xpath('//div[@id="list-container"]/ul/li/div'):
            self.a.append({
                '文章标题': first_text(info, 'a/text()'),
                '作者': first_text(info, 'div[1]/div/a/text()'),
                '发表时间': first_text(info, 'div[1]/div/span/@data-shared-at'),
                '内容': first_text(info, 'p/text()'),
                # The second text node is used for the read/comment links,
                # exactly as in the original queries.
                '阅读量': first_text(info, 'div[2]/a[2]/text()', '0', index=1),
                '评论数': first_text(info, 'div[2]/a[3]/text()', '0', index=1),
                # Must be a relative path: a query starting with '//' is
                # evaluated from the document root, which gave every row the
                # first like count found on the page.
                '点赞数': first_text(info, 'div[2]/span[1]/text()', '0'),
                '打赏数': first_text(info, 'div[2]/span[2]/text()', '0'),
                '主题': first_text(info, 'div[2]/a[@class="collection-tag"]/text()', '暂未收录专题'),
            })

        if save:
            self._save_csv()

    @staticmethod
    def _first_text(node, path, default='', index=0):
        """Return the stripped ``index``-th result of ``path``, or ``default``.

        ``path`` must select strings (``text()`` or an attribute). A missing
        result yields ``default`` instead of raising.
        """
        values = node.xpath(path)
        if len(values) > index:
            return values[index].strip()
        return default

    def _save_csv(self):
        """Write every collected article to ``jianshu_xpath2.csv``."""
        csv_name = ['文章标题', '作者', '发表时间', '内容', '阅读量', '评论数', '点赞数', '打赏数', '主题']
        with open('jianshu_xpath2.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_name)
            writer.writeheader()
            writer.writerows(self.a)
def main():
    """Create a ``Jianshu`` crawler and run its multi-page crawl."""
    crawler = Jianshu()
    crawler.total_page()


if __name__ == '__main__':
    main()
小结:
1、这里用xpath爬取网页的内容,是不是很方便?
虽然用正则、BeautifulSoup和Xpath都可以获取网页的内容,但是要学会灵活应用,有时遇到某一种方法获取不到就要用另外的方法(比如正则,只要你的正则表达式没写错,基本都是可以获取网页数据)
2、这里爬取多页是通过自己手动分析网页加载方式去构造每页的url,然后爬取全部的数据;对于这种异步加载的网页,后面还会介绍其他的方法;