Crawling Jianshu Homepage Articles with Scrapy (XPath)

2018-12-20  z小志

1. Create the Scrapy project

scrapy startproject jianshu
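For orientation, startproject lays out the standard Scrapy project structure; the files touched in the rest of this post are marked:

jianshu/
    scrapy.cfg                # deploy configuration
    jianshu/
        items.py              # JianshuItem is defined here
        pipelines.py          # MongoDB / CSV pipelines (section 5)
        settings.py           # project settings (ITEM_PIPELINES, MONGO_URI, ...)
        spiders/              # the spider generated in step 2 lives here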

2. Generate the Spider

# cd into the jianshu directory first
scrapy genspider jianshuspider www.jianshu.com
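genspider drops a spider skeleton into jianshu/spiders/jianshuspider.py, roughly like the following (the exact template depends on the Scrapy version); the parse method in the next step replaces the empty one it generates:

import scrapy

class JianshuspiderSpider(scrapy.Spider):
    name = 'jianshuspider'
    allowed_domains = ['www.jianshu.com']
    start_urls = ['http://www.jianshu.com/']

    def parse(self, response):
        pass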

3. Parse the Response (with XPath)

    def parse(self, response):
        # each <li> under ul.note-list is one article card on the homepage
        lis = response.xpath('//ul[@class="note-list"]/li')
        for li in lis:
            item = JianshuItem()
            try:
                item['author'] = li.xpath('.//div[@class="meta"]/a/text()').extract_first()
            except Exception as e:
                print(e)
                item['author'] = ''
            try:
                item['author_home'] = self.baseurl + li.xpath('.//div[@class="meta"]/a/@href').extract_first()
            except Exception as e:
                print(e)
                item['author_home'] = ''
            try:
                item['title'] = li.xpath('.//div[@class="content"]/a/text()').extract_first()
                print(item['title'])
            except Exception as e:
                print(e)
                item['title'] = ''
            try:
                item['content'] = li.xpath('.//div[@class="content"]/p/text()').extract_first()
            except Exception as e:
                print(e)
                item['content'] = ''
            try:
                # the comment link contains an icon plus the count; the count is the second text node
                item['comment_count'] = li.xpath('.//div[@class="meta"]/a[2]/text()').extract()[1].strip()
            except Exception as e:
                print(e)
                item['comment_count'] = ''
            try:
                item['like_count'] = li.xpath('.//div[@class="meta"]/span/text()').extract_first().strip()
            except Exception as e:
                print(e)
                item['like_count'] = ''
            try:
                item['detail_link'] = self.baseurl + li.xpath('.//div[@class="content"]/a/@href').extract_first()
            except Exception as e:
                print(e)
                item['detail_link'] = ''
            yield item
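The parse method above fills a JianshuItem; a minimal items.py sketch with exactly those fields (the field names come from the code above, nothing else is assumed) could look like this:

import scrapy

class JianshuItem(scrapy.Item):
    author = scrapy.Field()         # author nickname
    author_home = scrapy.Field()    # link to the author's homepage
    title = scrapy.Field()          # article title
    content = scrapy.Field()        # article summary text
    comment_count = scrapy.Field()  # number of comments
    like_count = scrapy.Field()     # number of likes
    detail_link = scrapy.Field()    # link to the full article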

Parsing the first page is straightforward, but how does the second page get loaded?
Looking at the XHR requests shows that the next-page URL has many &seen_snote_ids[]=xxxxx parameters appended. These turn out to be the article IDs from the first page, and the third and fourth pages likewise append the IDs of all the articles seen so far.
So:

# collect each article's id into a params list (params and page are spider attributes; see the sketch below)
        for li in lis:
            item = JianshuItem()
            self.params.append('seen_snote_ids[]=' + li.xpath('@data-note-id').extract_first())
            .....
            .....
        self.page += 1
        url = 'https://www.jianshu.com/?' + '&'.join(self.params) + "&page={}".format(self.page)
        yield scrapy.Request(url=url, callback=self.parse)
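The snippet above relies on self.baseurl, self.params and self.page without showing where they are defined; a minimal sketch of those spider attributes (the initial values are assumptions, only the names come from the code) might be:

class JianshuspiderSpider(scrapy.Spider):
    name = 'jianshuspider'
    baseurl = 'https://www.jianshu.com'   # prefix for the relative hrefs extracted above
    params = []                           # accumulated seen_snote_ids[] query parameters
    page = 1                              # current page number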

4. Problem: while crawling you will notice that the second page, and every page after it, returns the same data??!!

Some digging shows this is a cookie problem: the requests must carry cookies from a logged-in session. (I haven't found a better workaround; if anyone knows one, please share.)
So:

# convert a cookie string copied from the browser into a dict
class transCookie:
    def __init__(self, cookie):
        self.cookie = cookie

    def stringToDict(self):
        '''
        Convert the cookie string copied from the browser into a dict that Scrapy can use
        :return: dict mapping cookie names to values
        '''
        itemDict = {}
        items = self.cookie.split(';')
        for item in items:
            # split on the first '=' only, since cookie values may themselves contain '='
            key, value = item.strip().split('=', 1)
            itemDict[key] = value
        return itemDict
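A quick sanity check of what stringToDict returns (the two cookie names here are illustrative, taken from the real cookie below):

print(transCookie('locale=zh-CN; read_mode=day').stringToDict())
# {'locale': 'zh-CN', 'read_mode': 'day'}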

import scrapy
from scrapy import Request

class JianshuspiderSpider(scrapy.Spider):
    name = 'jianshuspider'

    def start_requests(self):
        # cookie string copied from the browser after logging in to jianshu.com
        cookies = '__yadk_uid=r2YeqY2ZLEnkL8W2oxS8nZ0Ob98dyXGj; read_mode=day; default_font=font2; locale=zh-CN; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1545285216,1545285885,1545285899,1545285983; remember_user_token=W1szMzk4MjQyXSwiJDJhJDEwJGNNTk9Pby85V1NWWVlXR1JEeC5MdU8iLCIxNTQ1Mjg2Nzk0LjA5NzA5MDUiXQ%3D%3D--89eb1bc31563fc8154a4cb02d6a3d81bab13932e; _m7e_session=4fbd7110d59c196780fc78419b536061; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%223398242%22%2C%22%24device_id%22%3A%22163fbd945388c5-023accccefcded-17366952-3686400-163fbd945396b3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22desktop%22%2C%22%24latest_utm_medium%22%3A%22index-banner-s%22%2C%22%24latest_utm_campaign%22%3A%22maleskine%22%2C%22%24latest_utm_content%22%3A%22note%22%7D%2C%22first_id%22%3A%22163fbd945388c5-023accccefcded-17366952-3686400-163fbd945396b3%22%7D; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1545286804'
        yield Request('https://www.jianshu.com/', cookies=transCookie(cookies).stringToDict(), callback=self.parse)

With that, crawling works normally.

5. Storage (CSV or MongoDB)

import pymongo

class JianshuPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # use the item class name as the MongoDB collection name
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

import csv
class Pipline_toCSV(object):

    def __init__(self):
        store_file = 'main.csv'
        # newline='' stops the csv module from inserting blank lines between rows
        self.file = open(store_file, 'a', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.file, fieldnames=['author', 'author_home', 'title', 'content', 'comment_count', 'like_count', 'detail_link'])

    def process_item(self, item, spider):
        # write the item as one CSV row; log and skip anything that fails to serialize
        try:
            self.writer.writerow(dict(item))
        except Exception as e:
            print(e)
            print('write failed')
        return item

    def close_spider(self, spider):
        self.file.close()

I won't go through the settings in detail here; a rough sketch of the entries that matter for the code above follows.
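A minimal settings.py sketch covering what the pipelines above expect. The pipeline paths follow the standard jianshu.pipelines module; the priorities and the MONGO_URI/MONGO_DB values are assumptions, only the key names come from the pipeline code:

ROBOTSTXT_OBEY = False   # assumption: jianshu's robots.txt would otherwise block most requests

ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,   # MongoDB pipeline
    'jianshu.pipelines.Pipline_toCSV': 400,     # CSV pipeline
}

MONGO_URI = 'localhost'   # example value; point this at your MongoDB instance
MONGO_DB = 'jianshu'      # example database name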

Closing: it took quite a while of tinkering to get this working. I'm new to this, so please go easy on me; I'm happy to learn together, and pointers from more experienced folks are very welcome. Thanks.

Open question: can the data be fetched without logging in? (Selenium certainly can, but it is less efficient; see the follow-up article: Crawling Jianshu Homepage Articles with Scrapy (Selenium).)
