
Python Crawler: Multi-process Scraping (Using the 58同城 Second-Hand Market as an Example)

2017-02-04  罗罗攀

Today, using the 58同城 second-hand market (that is, 转转) as an example, I'll walk through how to crawl structured data at scale.

Analysis

First, let's look at 转转's page structure and the data I want to scrape:


[Screenshots: category page (类目), listing page (物品页), detail page (详细页)]

My approach is to extract the links to the top-level categories first, then crawl each category's listing pages for item links, and finally scrape the data from each detail page. The project consists of three Python files: channel_extract.py, page_spider.py, and main.py.

channel_extract.py

import requests
from lxml import etree

start_url = 'http://cs.58.com/sale.shtml'
url_host = 'http://cs.58.com'

def get_channel_urls(url):
    # Fetch the second-hand market landing page and extract every category link.
    html = requests.get(url)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//div[@class="lbsear"]/div/ul/li')

    for info in infos:
        class_urls = info.xpath('ul/li/b/a/@href')
        for class_url in class_urls:
            print(url_host + class_url)


# Run once to print all category URLs, then paste them into channel_list below.
# get_channel_urls(start_url)

channel_list = '''
    http://cs.58.com/shouji/
    http://cs.58.com/tongxunyw/
    http://cs.58.com/danche/
    http://cs.58.com/fzixingche/
    http://cs.58.com/diandongche/
    http://cs.58.com/sanlunche/
    http://cs.58.com/peijianzhuangbei/
    http://cs.58.com/diannao/
    http://cs.58.com/bijiben/
    http://cs.58.com/pbdn/
    http://cs.58.com/diannaopeijian/
    http://cs.58.com/zhoubianshebei/
    http://cs.58.com/shuma/
    http://cs.58.com/shumaxiangji/
    http://cs.58.com/mpsanmpsi/
    http://cs.58.com/youxiji/
    http://cs.58.com/jiadian/
    http://cs.58.com/dianshiji/
    http://cs.58.com/ershoukongtiao/
    http://cs.58.com/xiyiji/
    http://cs.58.com/bingxiang/
    http://cs.58.com/binggui/
    http://cs.58.com/chuang/
    http://cs.58.com/ershoujiaju/
    http://cs.58.com/bangongshebei/
    http://cs.58.com/diannaohaocai/
    http://cs.58.com/bangongjiaju/
    http://cs.58.com/ershoushebei/
    http://cs.58.com/yingyou/
    http://cs.58.com/yingeryongpin/
    http://cs.58.com/muyingweiyang/
    http://cs.58.com/muyingtongchuang/
    http://cs.58.com/yunfuyongpin/
    http://cs.58.com/fushi/
    http://cs.58.com/nanzhuang/
    http://cs.58.com/fsxiemao/
    http://cs.58.com/xiangbao/
    http://cs.58.com/meirong/
    http://cs.58.com/yishu/
    http://cs.58.com/shufahuihua/
    http://cs.58.com/zhubaoshipin/
    http://cs.58.com/yuqi/
    http://cs.58.com/tushu/
    http://cs.58.com/tushubook/
    http://cs.58.com/wenti/
    http://cs.58.com/yundongfushi/
    http://cs.58.com/jianshenqixie/
    http://cs.58.com/huju/
    http://cs.58.com/qiulei/
    http://cs.58.com/yueqi/
    http://cs.58.com/chengren/
    http://cs.58.com/nvyongpin/
    http://cs.58.com/qinglvqingqu/
    http://cs.58.com/qingquneiyi/
    http://cs.58.com/xiaoyuan/
    http://cs.58.com/ershouqiugou/
    http://cs.58.com/tiaozao/
'''

Scraping the category links is straightforward, so I won't dwell on it. The scraped category links are then pasted into the channel_list variable as one multi-line string (the reason for this is explained below).
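As a quick illustration (my own minimal sketch, not part of the original scripts), the multi-line string splits cleanly into a list of URLs, which is exactly what main.py later feeds to Pool.map:

import sys
sys.path.append("..")
from channel_extract import channel_list

# str.split() with no arguments splits on any whitespace and drops empty
# strings, so the multi-line string becomes a clean list of channel URLs.
channels = channel_list.split()
print(len(channels))   # number of categories to crawl
print(channels[0])     # http://cs.58.com/shouji/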

page_spider.py

import requests
from lxml import etree
import time
import pymongo
# import random

# Connect to a local MongoDB instance; scraped records go into the
# 'tongcheng' collection of the 'test' database.
client = pymongo.MongoClient('localhost', 27017)
test = client['test']
tongcheng = test['tongcheng']

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Connection':'keep-alive'
}

# proxy_list = [
#     'http://218.56.132.158',
#     'http://115.47.44.102',
#     'http://118.144.149.200',
#     'http://222.223.239.135',
#     'http://123.234.219.133'
# ]
# proxy_ip = random.choice(proxy_list)
# proxies = {'http':proxy_ip}

def get_links_from(channel, pages):
    # Build the listing-page URL for one channel and page number,
    # e.g. http://cs.58.com/shouji/pn2/, then collect every item link on it.
    list_view = '{}pn{}/'.format(channel, str(pages))
    try:
        html = requests.get(list_view, headers=headers)
        time.sleep(2)  # throttle requests to reduce the risk of an IP ban
        selector = etree.HTML(html.text)
        if selector.xpath('//tr'):
            infos = selector.xpath('//tr')
            for info in infos:
                if info.xpath('td[2]/a/@href'):
                    url = info.xpath('td[2]/a/@href')[0]
                    get_info(url)
                else:
                    pass
        else:
            pass
    except requests.exceptions.ConnectionError:
        pass

def get_info(url):
    # Scrape title, price, area, view count and "want" count from a detail
    # page and store the record in MongoDB.
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    try:
        title = selector.xpath('//h1/text()')[0]
        if selector.xpath('//span[@class="price_now"]/i/text()'):
            price = selector.xpath('//span[@class="price_now"]/i/text()')[0]
        else:
            price = "无"
        if selector.xpath('//div[@class="palce_li"]/span/i/text()'):
            area = selector.xpath('//div[@class="palce_li"]/span/i/text()')[0]
        else:
            area = "无"
        view = selector.xpath('//p/span[1]/text()')[0]
        if selector.xpath('//p/span[2]/text()'):
            want = selector.xpath('//p/span[2]/text()')[0]
        else:
            want = "无"
        info = {
            'title': title,
            'price': price,
            'area': area,
            'view': view,
            'want': want,
            'url': url
        }
        tongcheng.insert_one(info)

    except IndexError:
        pass

1. Use try/except liberally to keep one bad page from killing the whole crawl; I only added these after several rounds of debugging.
2. Large-scale crawling calls for request headers, a proxy pool, request delays, and resumable crawling. Headers are a given: always send them so that Python looks like an ordinary browser. When you crawl for a long time your IP is easily banned; the best remedy is a proxy pool, and the commented-out code above shows one way to pick a random proxy (the free IPs I found were unreliable, so I dropped it). Request delays simply reduce how often you hit the site. Resumable crawling means that if the crawl stops halfway for whatever reason, it should pick up where it left off instead of starting over (I'll cover this properly another time); a rough sketch follows this list.
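As a rough idea of what resumable crawling could look like (my own hypothetical sketch, not the implementation from this post): every scraped record already stores its url in MongoDB, so a restarted run can simply skip detail pages that are already in the collection.

# resume_sketch.py -- hypothetical helper, reusing page_spider's collection.
import sys
sys.path.append("..")
from page_spider import tongcheng, get_info

def get_info_if_new(url):
    # Only fetch detail pages whose URL is not yet stored, so a restarted
    # crawl continues where the previous run stopped.
    if tongcheng.find_one({'url': url}) is None:
        get_info(url)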

main.py

import sys
sys.path.append("..")
from multiprocessing import Pool
from channel_extract import channel_list
from page_spider import get_links_from

def get_all_links_from(channel):
    # Crawl listing pages 1-100 of a single channel.
    for num in range(1, 101):
        get_links_from(channel, num)

if __name__ == '__main__':

    # Pool() starts one worker process per CPU core by default; map() hands
    # each channel URL to a worker, which then crawls that channel's pages.
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())

And that is the multiprocessing part: Pool() creates a pool of worker processes, and pool.map spreads the channel URLs across them. The usage really is that simple.
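To make what pool.map is doing concrete, here is a tiny self-contained example of my own (not from the original post): the iterable is split among the worker processes, each element is passed to the function, and the results come back in the original order.

from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == '__main__':
    # Four worker processes; by default Pool() uses one per CPU core.
    with Pool(processes=4) as pool:
        print(pool.map(square, range(10)))   # [0, 1, 4, 9, ..., 81]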

Results

To keep an eye on the crawl's progress, I also created a counts.py:

import sys
sys.path.append("..")
import time
from page_spider import tongcheng

# Print the number of stored records every 5 seconds.
while True:
    print(tongcheng.find().count())
    time.sleep(5)

Run it from the command line; as of writing it has scraped over 100,000 records and is still going...
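A side note from me (not in the original post): newer versions of pymongo have removed Cursor.count(), so with pymongo 3.7+ the same counter would use Collection.count_documents():

import sys
sys.path.append("..")
import time
from page_spider import tongcheng

# count_documents({}) replaces find().count(), which newer pymongo removed.
while True:
    print(tongcheng.count_documents({}))
    time.sleep(5)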


[Screenshot: crawl results]
