基于bs4+requests的蓝房网爬虫(进阶版)
2018-01-29 本文已影响50人
潇洒坤
1.代码可以直接运行,请下载anaconda并安装,用spyder方便查看变量
或者可以查看生成的excel文件
2.依赖库,命令行运行(WIN10打开命令行快捷键:windows+x组合键,然后按a键):
pip install BeautifulSoup4
pip install requests
3.爬取的网站是蓝房网(厦门)二手房,可以进入 http://xm.esf.lanfw.com/sell_zhuzhai/p1?keyword= 进行观察
4.关于如何判断代码是python2还是python3,print('')为python3,print ''为python2
简而言之就是print需要用括号的就是python3,下面代码如是。
5.爬取538个页面并进行解析,程序运行后需要等待大概500秒
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 15 23:30:28 2018
@author: Administrator
"""
def _stripPhrase(text, phrase):
    """Return *text* trimmed of whitespace and of *phrase* at either end.

    Unlike ``str.strip(phrase)``, which removes any of the individual
    characters of *phrase*, this only removes the exact phrase, so text that
    merely begins or ends with one of those characters is left intact.
    """
    text = text.strip()
    if not phrase:
        return text
    while text.endswith(phrase):
        text = text[:-len(phrase)].rstrip()
    while text.startswith(phrase):
        text = text[len(phrase):].lstrip()
    return text


def _parseDetailFields(details):
    """Map the ' | '-separated detail fields of one listing onto named columns.

    Layouts handled (by number of fields):
      6: size/type, floor, decoration, built time, orientation, unit price
      5: size/type, floor, decoration, built time, unit price
      4: size/type, floor, built time, unit price
    For any other length only the fields whose position is unambiguous are
    filled; the rest are empty strings. Every key is always present, so a
    value can never leak over from a previously parsed listing.
    """
    fields = {
        'houseSizeType': '',
        'houseFloor': '',
        'houseDecoration': '',
        'houseBuiltTime': '',
        'houseOrientation': '',
        'houseUnitPrice': '',
    }
    count = len(details)
    if count > 0:
        fields['houseSizeType'] = details[0]
    if count > 1:
        fields['houseFloor'] = details[1]
    if count == 4:
        fields['houseBuiltTime'] = details[2]
        fields['houseUnitPrice'] = details[3]
    elif count >= 5:
        fields['houseDecoration'] = details[2]
        fields['houseBuiltTime'] = details[3]
        if count == 5:
            fields['houseUnitPrice'] = details[4]
        elif count == 6:
            fields['houseOrientation'] = details[4]
            fields['houseUnitPrice'] = details[5]
        # count > 6: unknown layout, orientation/unit price stay empty.
    return fields


def _selectText(node, selector):
    """Return the text of the first element matching *selector*, or ''."""
    matches = node.select(selector)
    return matches[0].text if matches else ''


def getHousesDetails(url):
    """Download one listing page and return its houses as a list of dicts.

    Each element with class ``houseTxt`` yields one dict with the keys
    title, communityName, address, houseSizeType, houseFloor,
    houseDecoration, houseBuiltTime, houseOrientation, houseUnitPrice,
    price, squaremeter and keywords. Fields that cannot be found in the
    page are returned as empty strings instead of raising.

    Parameters:
        url: address of the listing page to fetch.

    Returns:
        list of dict, one per house found on the page (possibly empty).

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched, including when the request times out.
    """
    import requests
    from bs4 import BeautifulSoup

    # A timeout keeps a single stalled connection from hanging the crawl.
    response = requests.get(url, timeout=30)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    housesDetails = []
    for house in soup.select('.houseTxt'):
        paragraphs = house.select('.txtLeft p')

        # First paragraph: "<community> <address>" plus a "查看地图" link label.
        locationText = paragraphs[0].text if paragraphs else ''
        nameAndAddress = _stripPhrase(locationText, '查看地图').split()
        communityName = nameAndAddress[0] if nameAndAddress else ''
        # An address is only recorded when exactly two tokens are present.
        address = nameAndAddress[1] if len(nameAndAddress) == 2 else ''

        # Second paragraph: the ' | '-separated attribute list.
        detailText = paragraphs[1].text if len(paragraphs) > 1 else ''
        details = detailText.split(' | ')
        print(details)
        fields = _parseDetailFields(details)

        # Key order is kept stable because it can determine column order
        # when the list is turned into a DataFrame.
        housesDetails.append({
            'title': _selectText(house, '.txtLeft h2 a'),
            'communityName': communityName,
            'address': address,
            'houseSizeType': fields['houseSizeType'],
            'houseFloor': fields['houseFloor'],
            'houseDecoration': fields['houseDecoration'],
            'houseBuiltTime': fields['houseBuiltTime'],
            'houseOrientation': fields['houseOrientation'],
            'houseUnitPrice': fields['houseUnitPrice'],
            'price': _selectText(house, '.housePrice'),
            'squaremeter': _selectText(house, '.squaremeter'),
            'keywords': _selectText(house, '.houseTab'),
        })
    return housesDetails
def getAllHousesDetails(maxPageNumber=538):
    """Crawl listing pages 1..maxPageNumber and return all houses as a table.

    Parameters:
        maxPageNumber: number of the last listing page to fetch. Defaults
            to 538, the value that was previously hard-coded. A value
            below 1 fetches nothing and yields an empty DataFrame.

    Returns:
        pandas.DataFrame with one row per house, built from the dicts
        returned by getHousesDetails for every page.
    """
    import pandas

    urlBefore = 'http://xm.esf.lanfw.com/sell_zhuzhai/p{}?keyword='
    allHousesDetails = []
    for pageNumber in range(1, maxPageNumber + 1):
        allHousesDetails.extend(getHousesDetails(urlBefore.format(pageNumber)))
    return pandas.DataFrame(allHousesDetails)
if __name__ == '__main__':
    # Script entry point: crawl every listing page, then save the resulting
    # table as an Excel workbook in the current working directory.
    # The frame stays bound to a module-level name so it remains available
    # for inspection after the run (e.g. in an IDE variable explorer).
    allHousesDetails = getAllHousesDetails()
    allHousesDetails.to_excel(excel_writer='lanfwSecondHandHouseDetails2.xlsx')