Scraping Jianshu Homepage Articles with the Scrapy Framework (XPath)
2018-12-20
z小志
1. Create the Scrapy project
scrapy startproject jianshu
2. Generate the spider
# cd into the jianshu project directory first
scrapy genspider jianshuspider www.jianshu.com
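For reference, genspider creates a spider skeleton in jianshu/spiders/jianshuspider.py roughly like the following (the exact boilerplate depends on your Scrapy version); the parse method written in the next step replaces the stub:

import scrapy


class JianshuspiderSpider(scrapy.Spider):
    name = 'jianshuspider'
    allowed_domains = ['www.jianshu.com']
    start_urls = ['http://www.jianshu.com/']

    def parse(self, response):
        # the real parsing logic from step 3 goes here
        pass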
3. Parse the response (with XPath)
def parse(self, response):
    # each <li> under the note list is one article card on the homepage
    lis = response.xpath('//ul[@class="note-list"]/li')
    for li in lis:
        item = JianshuItem()
        try:
            item['author'] = li.xpath('.//div[@class="meta"]/a/text()').extract_first()
        except Exception as e:
            print(e)
            item['author'] = ''
        try:
            item['author_home'] = self.baseurl + li.xpath('.//div[@class="meta"]/a/@href').extract_first()
        except Exception as e:
            print(e)
            item['author_home'] = ''
        try:
            item['title'] = li.xpath('.//div[@class="content"]/a/text()').extract_first()
            print(item['title'])
        except Exception as e:
            print(e)
            item['title'] = ''
        try:
            item['content'] = li.xpath('.//div[@class="content"]/p/text()').extract_first()
        except Exception as e:
            print(e)
            item['content'] = ''
        try:
            item['comment_count'] = li.xpath('.//div[@class="meta"]/a[2]/text()').extract()[1].strip()
        except Exception as e:
            print(e)
            item['comment_count'] = ''
        try:
            item['like_count'] = li.xpath('.//div[@class="meta"]/span/text()').extract_first().strip()
        except Exception as e:
            print(e)
            item['like_count'] = ''
        try:
            item['detail_link'] = self.baseurl + li.xpath('.//div[@class="content"]/a/@href').extract_first()
        except Exception as e:
            print(e)
            item['detail_link'] = ''
        yield item
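The parse method assumes a JianshuItem with one Field per extracted value; a minimal items.py sketch based on the field names used above could look like this:

import scrapy


class JianshuItem(scrapy.Item):
    author = scrapy.Field()
    author_home = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    comment_count = scrapy.Field()
    like_count = scrapy.Field()
    detail_link = scrapy.Field()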
Parsing the first page is straightforward, but how do we load the second page?
Inspecting the XHR requests shows that the request URL has many &seen_snote_ids[]=xxxxx parameters appended. Analysis shows these are the article IDs from the first page; the third and fourth pages likewise append all article IDs seen so far.
so:
# keep a params list (and a page counter) on the spider; see the sketch after this snippet
for li in lis:
    item = JianshuItem()
    self.params.append('seen_snote_ids[]=' + li.xpath('@data-note-id').extract_first())
    .....
.....
self.page += 1
url = 'https://www.jianshu.com/?' + '&'.join(self.params) + "&page={}".format(self.page)
yield scrapy.Request(url=url, callback=self.parse)
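The snippet above relies on the spider keeping baseurl, params, and page as attributes; a minimal sketch of that state (attribute names taken from the code above, initial values are assumptions):

class JianshuspiderSpider(scrapy.Spider):
    name = 'jianshuspider'
    baseurl = 'https://www.jianshu.com'   # prefix for the relative author/detail links

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.params = []   # accumulated 'seen_snote_ids[]=...' query parameters
        self.page = 1      # current page number, incremented before requesting the next page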
4. Problem: while crawling, you will notice that the second page and every page after it return the same data!
Some digging shows this is a cookie issue: it only works with cookies from a logged-in session. (I haven't found a better way; if you know one, please share.)
so:
# convert a browser cookie string into a dict
class transCookie:
    def __init__(self, cookie):
        self.cookie = cookie

    def stringToDict(self):
        '''
        Convert a cookie string copied from the browser into a dict that Scrapy can use.
        :return: dict mapping cookie names to values
        '''
        itemDict = {}
        items = self.cookie.split(';')
        for item in items:
            # split on the first '=' only, since cookie values may themselves contain '='
            key, value = item.split('=', 1)
            itemDict[key.strip()] = value
        return itemDict
import scrapy
from scrapy import Request

class JianshuspiderSpider(scrapy.Spider):
    name = 'jianshuspider'

    def start_requests(self):
        cookies = '__yadk_uid=r2YeqY2ZLEnkL8W2oxS8nZ0Ob98dyXGj; read_mode=day; default_font=font2; locale=zh-CN; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1545285216,1545285885,1545285899,1545285983; remember_user_token=W1szMzk4MjQyXSwiJDJhJDEwJGNNTk9Pby85V1NWWVlXR1JEeC5MdU8iLCIxNTQ1Mjg2Nzk0LjA5NzA5MDUiXQ%3D%3D--89eb1bc31563fc8154a4cb02d6a3d81bab13932e; _m7e_session=4fbd7110d59c196780fc78419b536061; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%223398242%22%2C%22%24device_id%22%3A%22163fbd945388c5-023accccefcded-17366952-3686400-163fbd945396b3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22desktop%22%2C%22%24latest_utm_medium%22%3A%22index-banner-s%22%2C%22%24latest_utm_campaign%22%3A%22maleskine%22%2C%22%24latest_utm_content%22%3A%22note%22%7D%2C%22first_id%22%3A%22163fbd945388c5-023accccefcded-17366952-3686400-163fbd945396b3%22%7D; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1545286804'
        yield Request('https://www.jianshu.com/', cookies=transCookie(cookies).stringToDict(), callback=self.parse)
With this in place, the crawl works normally.
5. Storage (CSV or MongoDB)
import pymongo

class JianshuPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # use the item class name as the collection name
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
import csv

class Pipline_toCSV(object):
    def __init__(self):
        store_file = 'main.csv'
        self.file = open(store_file, 'a', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.file, fieldnames=['author', 'author_home', 'title', 'content', 'comment_count', 'like_count', 'detail_link'])

    def process_item(self, item, spider):
        # write the item as one CSV row
        try:
            self.writer.writerow(dict(item))
        except Exception as e:
            print(e)
            print('failed to write row')
        return item

    def close_spider(self, spider):
        self.file.close()
I won't go into the settings.py configuration in detail here.
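For completeness, here is a minimal settings.py sketch that registers both pipelines and supplies the MONGO_URI/MONGO_DB values read in from_crawler; the module paths, priorities, and database name are assumptions based on the project layout above:

# settings.py (excerpt, assumed values)
ROBOTSTXT_OBEY = False   # often needed for this kind of crawl; check the site's policy

ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,   # MongoDB storage
    'jianshu.pipelines.Pipline_toCSV': 400,     # CSV storage
}

MONGO_URI = 'localhost'
MONGO_DB = 'jianshu'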