Scraping Douban Books and Reviews with the Scrapy Framework (CSS Selectors)
2018-12-28
z小志
# Crawl the book list (callbacks excerpted from the spider class)
def parse(self, response):
    self.logger.debug(response)
    try:
        # Each book on the list page is an li.subject-item inside ul.subject-list
        lis = response.css('ul.subject-list .subject-item')
        self.logger.debug(lis)
        for li in lis:
            item = DoubanItem()
            name = li.css('.info h2 a::attr(title)').extract_first()
            item['name'] = name.strip() if name else ''
            img = li.css('.pic img::attr(src)').extract_first()
            item['img'] = img.strip() if img else ''
            public_info = li.css('.pub::text').extract_first()
            item['public_info'] = public_info.strip() if public_info else ''
            des = li.css('.info p::text').extract_first()
            item['des'] = des.strip() if des else ''
            detail_url = li.css('.info h2 a::attr(href)').extract_first()
            item['detail_url'] = detail_url.strip() if detail_url else ''
            # Cut the numeric book id out of the detail URL (.../subject/<id>/)
            book_id = self.txt_wrap_by('subject/', '/', detail_url)
            item['id'] = book_id if book_id else ''
            score = li.css('.rating_nums::text').extract_first()
            item['score'] = score.strip() if score else ''
            comment = li.css('.pl::text').extract_first()
            item['comment'] = comment.strip() if comment else ''
            # Only follow the review page when a detail URL was actually found;
            # detail URLs end in '/', so append 'reviews' without another slash
            if item['detail_url']:
                reviews_url = item['detail_url'] + 'reviews'
                self.logger.debug('reviews_url=' + reviews_url)
                yield scrapy.Request(url=reviews_url, callback=self.parse_reviews)
            yield item
        # Douban puts the next page in a <link> tag inside span.next
        next_url = response.css('.next link::attr(href)').extract_first()
        if next_url:
            self.logger.debug('next_url=' + next_url)
            yield scrapy.Request(url=response.urljoin(next_url.strip()), callback=self.parse)
    except Exception as e:
        self.logger.error(e)
        self.logger.info('crawl finished')
# Crawl the review info
def parse_reviews(self, response):
    divs = response.css('div.review-list div.main.review-item')
    self.logger.debug(divs)
    for div in divs:
        item = DoubanReviewsItem()
        # The review page URL still carries the book id (.../subject/<id>/reviews)
        book_id = self.txt_wrap_by('subject/', '/', response.url)
        item['id'] = book_id if book_id else ''
        name = div.css('.name::text').extract_first()
        item['name'] = name.strip() if name else ''
        content = div.css('.short-content::text').extract_first()
        item['content'] = content.strip() if content else ''
        yield item
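Neither DoubanReviewsItem nor the txt_wrap_by() helper that both callbacks rely on appears in the post. Judging by its call sites, txt_wrap_by(start, end, text) returns the substring between the two markers, which is how the numeric book id is cut out of a URL. A sketch of both, under that assumption:

# Hypothetical sketches -- not shown in the original post
import scrapy

class DoubanReviewsItem(scrapy.Item):
    id = scrapy.Field()       # id of the book the review belongs to
    name = scrapy.Field()     # reviewer name
    content = scrapy.Field()  # short review text

# Spider method: return the text sitting between start_str and end_str,
# e.g. txt_wrap_by('subject/', '/', '.../subject/123/') -> '123'
def txt_wrap_by(self, start_str, end_str, text):
    if not text:
        return None
    start = text.find(start_str)
    if start < 0:
        return None
    start += len(start_str)
    stop = text.find(end_str, start)
    if stop < 0:
        return None
    return text[start:stop]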
Partway through the crawl the IP got banned, so I wrote a downloader middleware that rotates proxies:
import logging

class ProxyMiddleware(object):
    def __init__(self, settings):
        self.logger = logging.getLogger(__name__)
        self.is_first = False

    def process_request(self, request, spider):
        # Switch IP on the very first request, and again whenever Scrapy retries one
        if request.meta.get('retry_times') or not self.is_first:
            self.is_first = True
            proxy = Proxy().get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        return cls(settings=crawler.settings)
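Scrapy only runs the middleware once it is registered in settings.py. A minimal sketch, assuming the class lives in douban/middlewares.py (the priority 543 is Scrapy's conventional example slot, not something the post specifies):

# settings.py -- register the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.ProxyMiddleware': 543,
}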
# Fetch an IP from the locally hosted proxy pool
import requests
from douban.settings import PROXY_URL

class Proxy(object):
    def __init__(self):
        self.proxy_url = PROXY_URL

    def get_random_proxy(self):
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                # The pool answers with a bare host:port string
                return response.text
        except requests.ConnectionError:
            return None
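PROXY_URL is read from settings.py and should point at the pool's random-proxy endpoint. The post doesn't show the value; if the pool is, say, a local ProxyPool instance, a plausible entry would be:

# settings.py -- hypothetical; point it at wherever your proxy pool listens
PROXY_URL = 'http://127.0.0.1:5555/random'

The endpoint is expected to return a bare host:port string (e.g. 1.2.3.4:8080), which ProxyMiddleware then prefixes with a scheme before assigning it to request.meta['proxy'].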