爬取房价

2019-11-11  本文已影响0人  Suria007

代码


任务.png

房天下页面

房天下页面.jpg

使用工具,jupyter notebook

首先导入各个库,如果不存在,就先通过 pip install 进行安装

import requests
from bs4 import BeautifulSoup
import csv

def pachong(inputurl):
    """Scrape one listing page and write (title, price) rows to the CSV.

    Fetches ``inputurl``, decodes the response as gb2312 and, for at most 20
    listings found in the first ``div`` whose class is ``nl_con clearfix``,
    writes ``[title, price_text]`` through the module-level ``writer`` and
    echoes the same pair to stdout.

    Args:
        inputurl: URL of the listing page to scrape.

    Returns:
        None. Returns early without writing anything when the page has no
        listing container.

    Note:
        ``writer`` is not a parameter; the calling script must bind a
        ``csv.writer`` to that global name before calling this function.
    """
    req = requests.get(url=inputurl)
    req.encoding = 'gb2312'
    bf = BeautifulSoup(req.text, "lxml")

    # Listing container; nothing to scrape if the page does not have one.
    containers = bf.find_all('div', class_='nl_con clearfix')
    if not containers:
        return
    listing = BeautifulSoup(str(containers[0]), "lxml")

    # Special case: a sold-out development uses a 'kanesf' div instead of a
    # price div. Relabel the first one so it is collected with the prices.
    # NOTE(review): only the first such div is relabelled, as before; with
    # several sold-out entries prices and titles may drift out of step.
    sold_out = listing.find_all('div', class_='kanesf')
    if sold_out:
        sold_out[0]['class'] = 'nhouse_price'

    prices = listing.find_all('div', class_='nhouse_price')
    addresses = listing.find_all('div', class_='address')
    anchors = BeautifulSoup(str(addresses), "lxml").find_all('a')

    # Prices and address anchors are paired by position; stop at the shorter
    # of the two lists or after 20 entries, whichever comes first.
    for price_div, anchor in zip(prices[:20], anchors):
        text = price_div.get_text()
        if price_div.em:
            # The unit suffix arrives as a replacement character followed by
            # 'O' (presumably a square-metre sign that does not decode as
            # gb2312); rewrite it to a readable 'm^2'.
            if text[-3:-1] == '�O' or text[-4:-2] == '�O':
                text = text.replace('�O', 'm^2')
        try:
            title = anchor['title']
        except KeyError:
            # An anchor without a title ends the page, as it did before.
            break
        writer.writerow([str(title), text])
        print(title, text)

# Driver: read the district links from the Beijing new-homes search page, then
# scrape every district into its own "<district>.csv" file.
target = 'https://newhouse.fang.com/house/s/?ctm=1.bj.xf_search.lpsearch_area.1'
req = requests.get(url=target)
req.encoding = 'gb2312'
furl = 'https://newhouse.fang.com'  # URL prefix for the relative hrefs
html = req.text

bf = BeautifulSoup(html, "lxml")
li = bf.find_all('li', class_='quyu_name dingwei')
lin = BeautifulSoup(str(li), "lxml")
link = lin.find_all('a')
location = ['朝阳','海淀','丰台','西城','东城','昌平','大兴','通州','房山','顺义','石景山','密云','门头沟','怀柔','延庆','平谷','燕郊','北京周边']

# link[0] is skipped, exactly as the original 1-based indexing did; the
# remaining anchors are paired with the district names by position.
for district, anchor in zip(location, link[1:]):
    # 'with' closes the file even if scraping this district raises.
    with open(district + '.csv', 'a+', newline='', encoding='utf-8') as csvfile:
        # pachong() writes through this module-level name.
        writer = csv.writer(csvfile)
        writer.writerow(['具体位置', '房价'])
        links = furl + anchor['href']
        pachong(links)  # scrape the first page of this district

        # Fetch the first page again to discover the pagination links.
        req = requests.get(url=links)
        req.encoding = 'gb2312'
        html = req.text
        bf = BeautifulSoup(html, "lxml")
        url = bf.find_all('div', class_='page')
        if not url:
            continue  # single-page district: no pager on the page

        # Skip the first anchor and the last two anchors of the pager.
        for page_link in url[0].find_all('a')[1:-2]:
            try:
                nurl = furl + page_link['href']  # URL of a further page
                pachong(nurl)
            except (KeyError, IndexError, requests.RequestException) as exc:
                # Best effort, as before: give up on the remaining pages of
                # this district, but say why instead of failing silently.
                print('skipping remaining pages of', district, '-', repr(exc))
                break

效果

csv文件.png jupyter notebook 显示效果.png
csv文件内容.png
做了一整天,终于搞出来了,难点在于如何排查需要的内容,有的房价直接显示,有的待定,有的售空,这就需要设置条件。
上一篇下一篇

猜你喜欢

热点阅读