Scraping Multiple Pages of a Static Website
王小鱼鱻 · 2017-07-11
Site being scraped: a business yellow-pages directory (企业黄页)
Here I scrape the companies of a single province, following the site's own classification: province → city → category → company. The method isn't limited to this site, though. You just walk down that hierarchy level by level, collecting the parameters each step needs. None of it is hard; the problem is that there is a lot of data, so the crawl is slow. I don't really understand multiprocessing or multithreading yet, which is why it stays slow. If anyone knows better, please leave a suggestion, ideally an optimal solution (see the sketches at the end of this post).
Enough preamble; here's the code. Pointers from more experienced folks are welcome:
import requests
from lxml import etree
import re
import math
import pymongo
import multiprocessing

# Connect to MongoDB
conn = pymongo.MongoClient(host='localhost', port=27017)
# Select (or create) the database
company = conn['company']
# Select (or create) the collection
newsdata = company['infos']
class Company(multiprocessing.Process):

    def __init__(self, interval):
        multiprocessing.Process.__init__(self)
        self.interval = interval
    # Get every city in the province
    def get_city(self, province_url):
        global city_name
        html = requests.get(province_url).content
        selector = etree.HTML(html)
        city_infos = selector.xpath('//div[@class="contentBox"]')[0]
        city_names = city_infos.xpath('div[@class="cityBox"]/a/text()')
        city_halfurls = city_infos.xpath('div[@class="cityBox"]/a/@href')
        for city_name, city_halfurl in zip(city_names, city_halfurls):
            city_url = "http://www.socom.cn" + city_halfurl
            print(city_name)
            print(city_url)
            self.get_item(city_url)
    # Get the company categories within a city
    def get_item(self, city_url):
        global item_name
        html = requests.get(city_url).content
        selector = etree.HTML(html)
        city_infos = selector.xpath('//div[@class="contentBox"]')[1]
        item_names = city_infos.xpath('div[@class="provinceBox"]/a/text()')
        item_halfurls = city_infos.xpath('div[@class="provinceBox"]/a/@href')
        for item_name, item_halfurl in zip(item_names, item_halfurls):
            item_url = "http://www.socom.cn" + item_halfurl
            print(item_name)
            print(item_url)
            self.get_page(item_url)
    # Work out how many result pages each category has
    def get_page(self, item_url):
        global company_page
        html = requests.get(item_url).content
        selector = etree.HTML(html)
        page_infos = selector.xpath('//div[@class="contentBox"]')[0]
        page_halfurl = page_infos.xpath('div[@class="description"]/a/@href')[0]
        txt = page_infos.xpath('div[@class="description"]/text()')[0]
        # Extract the total number of companies in this category
        companys = re.findall(r'\d+', txt)[0]
        # math.ceil() rounds up: e.g. 123 companies at 50 per page -> 3 pages
        pages = math.ceil(int(companys) / 50)
        for page in range(1, pages + 1):
            company_page = []
            page_url = "http://www.socom.cn" + page_halfurl + "&name=&cp=%d" % page
            self.get_company(page_url)
            # Insert one page of records at a time; insert_many()
            # raises InvalidOperation on an empty list, so guard it
            if company_page:
                newsdata.insert_many(company_page)
    # Get the URL of each company on a result page
    def get_company(self, page_url):
        html = requests.get(page_url).content
        selector = etree.HTML(html)
        company_infos = selector.xpath('//div[@class="contentBox"]')[3]
        company_halfurls = company_infos.xpath('div[@class="cityBox"]/a/@href')
        for company_halfurl in company_halfurls:
            company_url = "http://www.socom.cn" + company_halfurl
            print(company_url)
            self.company_detail(company_url)
    # Get the details of one company
    def company_detail(self, company_url):
        html = requests.get(company_url).text
        selector = etree.HTML(html)
        company_one = selector.xpath('//div[@class="contentBox"]')[1]
        company_name = company_one.xpath('div[@class="provinceBox"]/text()')[0]
        print(company_name)
        company_datas = company_one.xpath('div[@class="cityBox"]/div/text()')
        location = company_datas[0]
        phone = company_datas[1].strip()
        fax = company_datas[2].strip()
        mobile_phone = company_datas[3].strip()
        email = company_datas[5].strip()
        contact = company_datas[6].strip()
        person = company_datas[7].strip()
        capital = company_datas[8].strip()
        company_type = company_datas[9].strip()  # renamed so it doesn't shadow the built-in type()
        product = company_datas[10].strip()
        introduction = company_datas[11].strip()
        # The slices strip the Chinese field labels from each value
        company_one = {
            'province': '广东',  # hard-coded to match province_url below
            'city': city_name,
            'item': item_name,
            'company': company_name,
            'location': location[4:],
            'phone': phone[4:],
            'fax': fax[4:],
            'mobile_phone': mobile_phone[3:],
            'web': company_url,
            'email': email[3:],
            'contact': contact[4:],
            'person': person[5:],
            'capital': capital[5:],
            'type': company_type[5:],
            'product': product[5:],
            'introduction': introduction[5:]
        }
        # Collect into the page-level list created in get_page()
        company_page.append(company_one)
if __name__ == "__main__":
    # URL of the province to crawl
    province_url = "http://www.socom.cn/guangdong/"
    p = Company(4)
    p.get_city(province_url)
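One thing worth pointing out: the class subclasses multiprocessing.Process but never defines run(), and the main block calls get_city() directly instead of start(), so everything still runs in a single process (and interval is never used). Here is a minimal sketch of the intended pattern, assuming one worker process per province; the Crawler name and the second URL are made up for illustration:

import multiprocessing

class Crawler(multiprocessing.Process):
    def __init__(self, province_url):
        multiprocessing.Process.__init__(self)
        self.province_url = province_url

    def run(self):
        # start() executes run() in the child process;
        # the real crawl logic would go here
        print("crawling", self.province_url)

if __name__ == "__main__":
    # hypothetical province URLs, for illustration only
    urls = ["http://www.socom.cn/guangdong/", "http://www.socom.cn/guangxi/"]
    workers = [Crawler(u) for u in urls]
    for w in workers:
        w.start()  # spawn the child processes
    for w in workers:
        w.join()   # wait for them all to finish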
The crawl results: [screenshot]

Summary:
1. There is quite a lot of data, so the crawl is a bit slow; you could try scrapy, or multithreading.
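Since the bottleneck is network I/O rather than CPU, a thread pool is probably the easiest win. Below is a minimal sketch using concurrent.futures.ThreadPoolExecutor; fetch_detail() is a hypothetical stand-in for company_detail() above, and the URLs are made up for illustration. Note that the global city_name/item_name pattern above would have to become per-task parameters before parallelizing:

from concurrent.futures import ThreadPoolExecutor

import requests

def fetch_detail(url):
    # each worker thread downloads one detail page; parsing would go here
    return requests.get(url, timeout=10).text

if __name__ == "__main__":
    # hypothetical company detail URLs, as collected by get_company()
    urls = ["http://www.socom.cn/company/1.html",
            "http://www.socom.cn/company/2.html"]
    with ThreadPoolExecutor(max_workers=8) as pool:
        for html in pool.map(fetch_detail, urls):
            print(len(html))

pymongo's MongoClient is thread-safe, so each thread could also write its parsed record straight to the collection.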