Scraping Multiple Pages of a Static Website
王小鱼鱻 · 2017-07-11
Site being scraped: a business yellow-pages directory (企业黄页)
Here I scrape the companies of a single province, following the site's own classification: province → city → category → company. The method isn't limited to this site, though. You just walk down that hierarchy level by level, collecting the parameters each step needs. None of it is hard; the problem is that there is a lot of data, so the crawl is slow. I don't really understand multiprocessing or multithreading yet, which is why it stays slow. If anyone knows better, please leave a suggestion, ideally an optimal solution (see the sketches at the end of this post).
Enough preamble; here's the code. Pointers from more experienced folks are welcome:
import requests
from lxml import etree
import re
import math
import pymongo
import multiprocessing

# Connect to MongoDB
conn = pymongo.MongoClient(host='localhost', port=27017)
# Select (or create) the database
company = conn['company']
# Select (or create) the collection
newsdata = company['infos']
class Company(multiprocessing.Process):

    def __init__(self, interval):
        multiprocessing.Process.__init__(self)
        self.interval = interval
    # Get every city in the province
    def get_city(self, province_url):
        global city_name
        html = requests.get(province_url).content
        selector = etree.HTML(html)
        city_infos = selector.xpath('//div[@class="contentBox"]')[0]
        city_names = city_infos.xpath('div[@class="cityBox"]/a/text()')
        city_halfurls = city_infos.xpath('div[@class="cityBox"]/a/@href')
        for city_name, city_halfurl in zip(city_names, city_halfurls):
            city_url = "http://www.socom.cn" + city_halfurl
            print(city_name)
            print(city_url)
            self.get_item(city_url)
    # Get the company categories within a city
    def get_item(self, city_url):
        global item_name
        html = requests.get(city_url).content
        selector = etree.HTML(html)
        city_infos = selector.xpath('//div[@class="contentBox"]')[1]
        item_names = city_infos.xpath('div[@class="provinceBox"]/a/text()')
        item_halfurls = city_infos.xpath('div[@class="provinceBox"]/a/@href')
        for item_name, item_halfurl in zip(item_names, item_halfurls):
            item_url = "http://www.socom.cn" + item_halfurl
            print(item_name)
            print(item_url)
            self.get_page(item_url)
    # Work out how many result pages each category has
    def get_page(self, item_url):
        global company_page
        html = requests.get(item_url).content
        selector = etree.HTML(html)
        page_infos = selector.xpath('//div[@class="contentBox"]')[0]
        page_halfurl = page_infos.xpath('div[@class="description"]/a/@href')[0]
        txt = page_infos.xpath('div[@class="description"]/text()')[0]
        # Extract the total number of companies in this category
        companys = re.findall(r'\d+', txt)[0]
        # math.ceil() rounds up: e.g. 123 companies at 50 per page -> 3 pages
        pages = math.ceil(int(companys) / 50)
        for page in range(1, pages + 1):
            company_page = []
            page_url = "http://www.socom.cn" + page_halfurl + "&name=&cp=%d" % page
            self.get_company(page_url)
            # Insert one page of records at a time; insert_many()
            # raises InvalidOperation on an empty list, so guard it
            if company_page:
                newsdata.insert_many(company_page)
    # Get the URL of each company on a result page
    def get_company(self, page_url):
        html = requests.get(page_url).content
        selector = etree.HTML(html)
        company_infos = selector.xpath('//div[@class="contentBox"]')[3]
        company_halfurls = company_infos.xpath('div[@class="cityBox"]/a/@href')
        for company_halfurl in company_halfurls:
            company_url = "http://www.socom.cn" + company_halfurl
            print(company_url)
            self.company_detail(company_url)
    # Get the details of one company
    def company_detail(self, company_url):
        html = requests.get(company_url).text
        selector = etree.HTML(html)
        company_one = selector.xpath('//div[@class="contentBox"]')[1]
        company_name = company_one.xpath('div[@class="provinceBox"]/text()')[0]
        print(company_name)
        company_datas = company_one.xpath('div[@class="cityBox"]/div/text()')
        location = company_datas[0]
        phone = company_datas[1].strip()
        fax = company_datas[2].strip()
        mobile_phone = company_datas[3].strip()
        email = company_datas[5].strip()
        contact = company_datas[6].strip()
        person = company_datas[7].strip()
        capital = company_datas[8].strip()
        company_type = company_datas[9].strip()  # renamed so it doesn't shadow the built-in type()
        product = company_datas[10].strip()
        introduction = company_datas[11].strip()
        # The slices strip the Chinese field labels from each value
        company_one = {
            'province': '广东',  # hard-coded to match province_url below
            'city': city_name,
            'item': item_name,
            'company': company_name,
            'location': location[4:],
            'phone': phone[4:],
            'fax': fax[4:],
            'mobile_phone': mobile_phone[3:],
            'web': company_url,
            'email': email[3:],
            'contact': contact[4:],
            'person': person[5:],
            'capital': capital[5:],
            'type': company_type[5:],
            'product': product[5:],
            'introduction': introduction[5:]
        }
        # Collect into the page-level list created in get_page()
        company_page.append(company_one)
if __name__ == "__main__":
    # URL of the province to crawl
    province_url = "http://www.socom.cn/guangdong/"
    p = Company(4)
    p.get_city(province_url)
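One thing worth pointing out: the class subclasses multiprocessing.Process but never defines run(), and the main block calls get_city() directly instead of start(), so everything still runs in a single process (and interval is never used). Here is a minimal sketch of the intended pattern, assuming one worker process per province; the Crawler name and the second URL are made up for illustration:

import multiprocessing

class Crawler(multiprocessing.Process):
    def __init__(self, province_url):
        multiprocessing.Process.__init__(self)
        self.province_url = province_url

    def run(self):
        # start() executes run() in the child process;
        # the real crawl logic would go here
        print("crawling", self.province_url)

if __name__ == "__main__":
    # hypothetical province URLs, for illustration only
    urls = ["http://www.socom.cn/guangdong/", "http://www.socom.cn/guangxi/"]
    workers = [Crawler(u) for u in urls]
    for w in workers:
        w.start()  # spawn the child processes
    for w in workers:
        w.join()   # wait for them all to finish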
The crawl results: [screenshot]

Summary:
1. There is quite a lot of data, so the crawl is a bit slow; you could try scrapy, or multithreading.
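Since the bottleneck is network I/O rather than CPU, a thread pool is probably the easiest win. Below is a minimal sketch using concurrent.futures.ThreadPoolExecutor; fetch_detail() is a hypothetical stand-in for company_detail() above, and the URLs are made up for illustration. Note that the global city_name/item_name pattern above would have to become per-task parameters before parallelizing:

from concurrent.futures import ThreadPoolExecutor

import requests

def fetch_detail(url):
    # each worker thread downloads one detail page; parsing would go here
    return requests.get(url, timeout=10).text

if __name__ == "__main__":
    # hypothetical company detail URLs, as collected by get_company()
    urls = ["http://www.socom.cn/company/1.html",
            "http://www.socom.cn/company/2.html"]
    with ThreadPoolExecutor(max_workers=8) as pool:
        for html in pool.map(fetch_detail, urls):
            print(len(html))

pymongo's MongoClient is thread-safe, so each thread could also write its parsed record straight to the collection.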