
Python Crawler: Multi-process Scraping (Using the 58同城 Second-Hand Market as an Example)

2017-02-04  罗罗攀

Today, using the 58同城 second-hand market (that is, 转转) as an example, I'll walk through how to crawl structured data at scale.

Analysis

First, let's look at 转转's page structure and the data I want to scrape:


[Screenshots: category page (类目), listing page (物品页), detail page (详细页)]

My approach is to extract the links to the top-level categories first, then crawl each category's listing pages for item links, and finally scrape the data from each detail page. The project consists of three Python files: channel_extract.py, page_spider.py, and main.py.

channel_extract.py

import requests
from lxml import etree

start_url = 'http://cs.58.com/sale.shtml'
url_host = 'http://cs.58.com'

def get_channel_urls(url):
    # Fetch the second-hand market landing page and extract every category link.
    html = requests.get(url)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//div[@class="lbsear"]/div/ul/li')

    for info in infos:
        class_urls = info.xpath('ul/li/b/a/@href')
        for class_url in class_urls:
            print(url_host + class_url)


# Run once to print all category URLs, then paste them into channel_list below.
# get_channel_urls(start_url)

channel_list = '''
    http://cs.58.com/shouji/
    http://cs.58.com/tongxunyw/
    http://cs.58.com/danche/
    http://cs.58.com/fzixingche/
    http://cs.58.com/diandongche/
    http://cs.58.com/sanlunche/
    http://cs.58.com/peijianzhuangbei/
    http://cs.58.com/diannao/
    http://cs.58.com/bijiben/
    http://cs.58.com/pbdn/
    http://cs.58.com/diannaopeijian/
    http://cs.58.com/zhoubianshebei/
    http://cs.58.com/shuma/
    http://cs.58.com/shumaxiangji/
    http://cs.58.com/mpsanmpsi/
    http://cs.58.com/youxiji/
    http://cs.58.com/jiadian/
    http://cs.58.com/dianshiji/
    http://cs.58.com/ershoukongtiao/
    http://cs.58.com/xiyiji/
    http://cs.58.com/bingxiang/
    http://cs.58.com/binggui/
    http://cs.58.com/chuang/
    http://cs.58.com/ershoujiaju/
    http://cs.58.com/bangongshebei/
    http://cs.58.com/diannaohaocai/
    http://cs.58.com/bangongjiaju/
    http://cs.58.com/ershoushebei/
    http://cs.58.com/yingyou/
    http://cs.58.com/yingeryongpin/
    http://cs.58.com/muyingweiyang/
    http://cs.58.com/muyingtongchuang/
    http://cs.58.com/yunfuyongpin/
    http://cs.58.com/fushi/
    http://cs.58.com/nanzhuang/
    http://cs.58.com/fsxiemao/
    http://cs.58.com/xiangbao/
    http://cs.58.com/meirong/
    http://cs.58.com/yishu/
    http://cs.58.com/shufahuihua/
    http://cs.58.com/zhubaoshipin/
    http://cs.58.com/yuqi/
    http://cs.58.com/tushu/
    http://cs.58.com/tushubook/
    http://cs.58.com/wenti/
    http://cs.58.com/yundongfushi/
    http://cs.58.com/jianshenqixie/
    http://cs.58.com/huju/
    http://cs.58.com/qiulei/
    http://cs.58.com/yueqi/
    http://cs.58.com/chengren/
    http://cs.58.com/nvyongpin/
    http://cs.58.com/qinglvqingqu/
    http://cs.58.com/qingquneiyi/
    http://cs.58.com/xiaoyuan/
    http://cs.58.com/ershouqiugou/
    http://cs.58.com/tiaozao/
'''

Scraping the category links is straightforward, so I won't dwell on it. The scraped category links are then pasted into the channel_list variable as one multi-line string (the reason for this is explained below).
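As a quick illustration (my own minimal sketch, not part of the original scripts), the multi-line string splits cleanly into a list of URLs, which is exactly what main.py later feeds to Pool.map:

import sys
sys.path.append("..")
from channel_extract import channel_list

# str.split() with no arguments splits on any whitespace and drops empty
# strings, so the multi-line string becomes a clean list of channel URLs.
channels = channel_list.split()
print(len(channels))   # number of categories to crawl
print(channels[0])     # http://cs.58.com/shouji/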

page_spider.py

import requests
from lxml import etree
import time
import pymongo
# import random

# Connect to a local MongoDB instance; scraped records go into the
# 'tongcheng' collection of the 'test' database.
client = pymongo.MongoClient('localhost', 27017)
test = client['test']
tongcheng = test['tongcheng']

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Connection':'keep-alive'
}

# proxy_list = [
#     'http://218.56.132.158',
#     'http://115.47.44.102',
#     'http://118.144.149.200',
#     'http://222.223.239.135',
#     'http://123.234.219.133'
# ]
# proxy_ip = random.choice(proxy_list)
# proxies = {'http':proxy_ip}

def get_links_from(channel, pages):
    # Build the listing-page URL for one channel and page number,
    # e.g. http://cs.58.com/shouji/pn2/, then collect every item link on it.
    list_view = '{}pn{}/'.format(channel, str(pages))
    try:
        html = requests.get(list_view, headers=headers)
        time.sleep(2)  # throttle requests to reduce the risk of an IP ban
        selector = etree.HTML(html.text)
        if selector.xpath('//tr'):
            infos = selector.xpath('//tr')
            for info in infos:
                if info.xpath('td[2]/a/@href'):
                    url = info.xpath('td[2]/a/@href')[0]
                    get_info(url)
                else:
                    pass
        else:
            pass
    except requests.exceptions.ConnectionError:
        pass

def get_info(url):
    # Scrape title, price, area, view count and "want" count from a detail
    # page and store the record in MongoDB.
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    try:
        title = selector.xpath('//h1/text()')[0]
        if selector.xpath('//span[@class="price_now"]/i/text()'):
            price = selector.xpath('//span[@class="price_now"]/i/text()')[0]
        else:
            price = "无"
        if selector.xpath('//div[@class="palce_li"]/span/i/text()'):
            area = selector.xpath('//div[@class="palce_li"]/span/i/text()')[0]
        else:
            area = "无"
        view = selector.xpath('//p/span[1]/text()')[0]
        if selector.xpath('//p/span[2]/text()'):
            want = selector.xpath('//p/span[2]/text()')[0]
        else:
            want = "无"
        info = {
            'title': title,
            'price': price,
            'area': area,
            'view': view,
            'want': want,
            'url': url
        }
        tongcheng.insert_one(info)

    except IndexError:
        pass

1. Use try/except liberally to keep one bad page from killing the whole crawl; I only added these after several rounds of debugging.
2. Large-scale crawling calls for request headers, a proxy pool, request delays, and resumable crawling. Headers are a given: always send them so that Python looks like an ordinary browser. When you crawl for a long time your IP is easily banned; the best remedy is a proxy pool, and the commented-out code above shows one way to pick a random proxy (the free IPs I found were unreliable, so I dropped it). Request delays simply reduce how often you hit the site. Resumable crawling means that if the crawl stops halfway for whatever reason, it should pick up where it left off instead of starting over (I'll cover this properly another time); a rough sketch follows this list.
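As a rough idea of what resumable crawling could look like (my own hypothetical sketch, not the implementation from this post): every scraped record already stores its url in MongoDB, so a restarted run can simply skip detail pages that are already in the collection.

# resume_sketch.py -- hypothetical helper, reusing page_spider's collection.
import sys
sys.path.append("..")
from page_spider import tongcheng, get_info

def get_info_if_new(url):
    # Only fetch detail pages whose URL is not yet stored, so a restarted
    # crawl continues where the previous run stopped.
    if tongcheng.find_one({'url': url}) is None:
        get_info(url)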

main.py

import sys
sys.path.append("..")
from multiprocessing import Pool
from channel_extract import channel_list
from page_spider import get_links_from

def get_all_links_from(channel):
    # Crawl listing pages 1-100 of a single channel.
    for num in range(1, 101):
        get_links_from(channel, num)

if __name__ == '__main__':

    # Pool() starts one worker process per CPU core by default; map() hands
    # each channel URL to a worker, which then crawls that channel's pages.
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())

And that is the multiprocessing part: Pool() creates a pool of worker processes, and pool.map spreads the channel URLs across them. The usage really is that simple.
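To make what pool.map is doing concrete, here is a tiny self-contained example of my own (not from the original post): the iterable is split among the worker processes, each element is passed to the function, and the results come back in the original order.

from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == '__main__':
    # Four worker processes; by default Pool() uses one per CPU core.
    with Pool(processes=4) as pool:
        print(pool.map(square, range(10)))   # [0, 1, 4, 9, ..., 81]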

Results

To keep an eye on the crawl's progress, I also created a counts.py:

import sys
sys.path.append("..")
import time
from page_spider import tongcheng

# Print the number of stored records every 5 seconds.
while True:
    print(tongcheng.find().count())
    time.sleep(5)

Run it from the command line; as of writing it has scraped over 100,000 records and is still going...
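A side note from me (not in the original post): newer versions of pymongo have removed Cursor.count(), so with pymongo 3.7+ the same counter would use Collection.count_documents():

import sys
sys.path.append("..")
import time
from page_spider import tongcheng

# count_documents({}) replaces find().count(), which newer pymongo removed.
while True:
    print(tongcheng.count_documents({}))
    time.sleep(5)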


[Screenshot: crawl results]
