Scraping Douban Books and Reviews with Scrapy (CSS Selectors)

2018-12-28 · z小志
    # Parse a book list page: one DoubanItem per book
    def parse(self, response):
        self.logger.debug(response)
        try:
            lis = response.css('ul.subject-list .subject-item')
            self.logger.debug(lis)
            for li in lis:
                item = DoubanItem()

                name = li.css('.info h2 a::attr(title)').extract_first()
                item['name'] = name.strip() if name else ''

                img = li.css('.pic img::attr(src)').extract_first()
                item['img'] = img.strip() if img else ''

                public_info = li.css('.pub::text').extract_first()
                item['public_info'] = public_info.strip() if public_info else ''

                des = li.css('.info p::text').extract_first()
                item['des'] = des.strip() if des else ''

                detail_url = li.css('.info h2 a::attr(href)').extract_first()
                item['detail_url'] = detail_url.strip() if detail_url else ''

                # Pull the Douban subject id out of the detail URL
                id = self.txt_wrap_by('subject/', '/', detail_url)
                item['id'] = id if id else ''

                score = li.css('.rating_nums::text').extract_first()
                item['score'] = score.strip() if score else ''

                comment = li.css('.pl::text').extract_first()
                item['comment'] = comment.strip() if comment else ''
                # Queue this book's review list page; detail URLs already end
                # with '/', so only 'reviews' needs appending
                if item['detail_url']:
                    reviews_url = item['detail_url'] + 'reviews'
                    self.logger.debug('reviews_url=' + reviews_url)
                    yield scrapy.Request(url=reviews_url, callback=self.parse_reviews)
                yield item
            next_url = response.css('.next link::attr(href)').extract_first()
            if next_url:
                self.logger.debug('next_url=' + next_url)
                yield scrapy.Request(url=response.urljoin(next_url.strip()), callback=self.parse)
        except Exception as e:
            self.logger.error(e)
            self.logger.info('Crawl finished')


    # Parse the review list page for one book
    def parse_reviews(self, response):
        divs = response.css('div.review-list div.main.review-item')
        self.logger.debug(divs)
        for div in divs:
            item = DoubanReviewsItem()
            # Recover the subject id from the reviews page URL so each
            # review can be joined back to its book
            id = self.txt_wrap_by('subject/', '/', response.url)
            item['id'] = id if id else ''
            name = div.css('.name::text').extract_first()
            item['name'] = name.strip() if name else ''
            content = div.css('.short-content::text').extract_first()
            item['content'] = content.strip() if content else ''
            yield item
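
For reference, the two Item classes used above only need the fields the callbacks fill in; here is a sketch of the items.py definitions, inferred from those assignments:

import scrapy

class DoubanItem(scrapy.Item):
    name = scrapy.Field()
    img = scrapy.Field()
    public_info = scrapy.Field()
    des = scrapy.Field()
    detail_url = scrapy.Field()
    id = scrapy.Field()
    score = scrapy.Field()
    comment = scrapy.Field()

class DoubanReviewsItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    content = scrapy.Field()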

Partway through the crawl my IP got banned, so I wrote a downloader middleware that switches to a proxy:

import logging

# Proxy is the pool client defined further below; the import path is an
# assumption, point it at wherever the class lives in your project
from douban.proxy import Proxy


class ProxyMiddleware(object):
    def __init__(self, settings):
        self.logger = logging.getLogger(__name__)
        self.is_first = False

    def process_request(self, request, spider):
        # Switch IP on the very first request and whenever Scrapy retries a
        # request (RetryMiddleware sets request.meta['retry_times'])
        if request.meta.get('retry_times') or not self.is_first:
            self.is_first = True
            proxy = Proxy().get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            settings=crawler.settings
        )
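
To take effect, the middleware also has to be registered in settings.py. A minimal sketch, assuming the project is named douban and the class lives in douban/middlewares.py (the priority 543 is just Scrapy's conventional example slot, and the PROXY_URL address is a placeholder for wherever the local pool listens):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.ProxyMiddleware': 543,
}
RETRY_TIMES = 5  # each retry re-triggers the proxy switch via meta['retry_times']
PROXY_URL = 'http://127.0.0.1:5555/random'  # placeholder: local proxy pool endpoint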


# Fetch an IP from the locally hosted proxy pool
from douban.settings import PROXY_URL
import requests

class Proxy(object):
    def __init__(self):
        self.proxy_url = PROXY_URL

    def get_random_proxy(self):
        try:
            response = requests.get(self.proxy_url, timeout=5)
            if response.status_code == 200:
                # The pool is expected to answer with a bare 'host:port' string
                return response.text.strip()
        except requests.RequestException:
            return False
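
A quick sanity check of the pool client; the value in the comment is only a placeholder for whatever address your pool actually returns:

if __name__ == '__main__':
    print(Proxy().get_random_proxy())  # e.g. '123.57.1.16:8080' (placeholder)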
