爬取房天下二手房数据和二手房房价分析及预测

2020-07-11 本文已影响0人雷全龙

第一部分爬虫

数据来源：房天下
网页结构分析

通过抓包分析网页信息，房源信息就是包含在当前HTML文件中。
目标URL规律：

第一页：https://lz.esf.fang.com/house/i31/
第二页：https://lz.esf.fang.com/house/i32/
第三页：https://lz.esf.fang.com/house/i33/
......
第十页：https://lz.esf.fang.com/house/i310/
从中可以看出，变化的只是最后面的一部分，那么实现多页爬取时构造新的URL就比较容易。（拼接页数就可以）

爬取内容分析
为了得到更多的有用信息，需要进行详情页的跳转，也就是说，首先获取详情页链接，然后再请求该链接获取房源信息。

info1.png
info2.png
难点

重定向

当兴高采烈地拿着地址去访问的时候，返回信息如下，脸黑了，说明发生了重定向问题。在请求该地址后，会出现短暂的“跳转”字眼。

跳转.png

那么我们就在这个网页信息里查找下一个请求地址。如下图所示，“点击跳转”前面的链接中就有我们想要的信息，也就是目标网页的请求地址。

点击跳转.png
在进行详情页跳转的时候也存在这个问题，分析思路是一样的。

验证码

selenium + 云打码平台解决或者人工输入

爬取思路

过程.png
代码实现

详情页地址

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
# 自定义的UA库
from UA import ua
import random
from lxml import etree
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from selenium import webdriver

# Requests in this script are sent with verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Shared request headers; the User-Agent value is filled in with a random
# entry from `ua` before requests are sent.
headers = {'User-Agent': ''}

def get_detail_url(url, max_retries=3):
    """Collect the detail-page URLs listed on one result page.

    The listing URL first answers with an interstitial "redirect" page; the
    real listing address is read from its ``btn-redir`` link and requested in
    a second step.  Every detail link found is appended to the module-level
    ``house`` list.

    Any failure while fetching or parsing (for example the redirect link being
    absent from the response) is treated as a captcha challenge:
    ``process_captcha()`` is run and the page is requested again.  Retries are
    bounded so that a page which can never be parsed does not recurse forever.

    Args:
        url: Listing page URL, e.g. ``https://lz.esf.fang.com/house/i31/``.
        max_retries: Number of retries allowed after the first failed attempt
            before the page is skipped.

    Returns:
        None.  Results are accumulated in the global ``house`` list.
    """
    for attempt in range(max_retries + 1):
        # Rotate the User-Agent on every attempt.
        headers['User-Agent'] = random.choice(ua)
        try:
            r = requests.get(url, headers=headers, verify=False, timeout=60)
            html = etree.HTML(r.text)
            # The first response only carries the link to the real listing page.
            real_url = html.xpath('//a[@class="btn-redir"]/@href')[0]
            r = requests.get(real_url, headers=headers, verify=False, timeout=60)
            html = etree.HTML(r.text)
            hrefs = html.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@href')
            channels = html.xpath('//div[@class="shop_list shop_list_4"]/dl/dt/a/@data_channel')
            next_urls = ['https://lz.esf.fang.com' + href + '?channel=' + channel
                         for href, channel in zip(hrefs, channels)]
            house.extend(next_urls)
            return
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl+C still stops the crawler.
            if attempt < max_retries:
                print(f'请求失败（{exc!r}），处理验证码后重试：{url}')
                process_captcha()
    print(f'多次重试仍失败，跳过该页：{url}')

def process_captcha():
    """Open a browser on a captcha page so a human can solve the challenge.

    A Firefox window is opened on a fixed detail-page URL, the function waits
    12 seconds for the captcha to be typed in by hand, and then clicks the
    element named ``submit``.  The browser is always shut down afterwards,
    even if loading the page or clicking fails.
    """
    # Local import: `By` is only needed here. The locator-based
    # `find_element` call works on both Selenium 3 and Selenium 4, whereas
    # `find_element_by_name` was removed in Selenium 4.
    from selenium.webdriver.common.by import By

    # Any URL that brings up the captcha page will do; this one is arbitrary.
    url = 'https://lz.esf.fang.com/chushou/3_416752691.htm?channel=2,2'
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        # Give the operator time to type the captcha manually.
        time.sleep(12)
        driver.find_element(By.NAME, 'submit').click()
    finally:
        # quit() ends the whole session, so no browser/driver process is
        # left behind when this is called repeatedly.
        driver.quit()

if __name__ == '__main__':

    # The site only seems to expose 100 result pages per query; to collect
    # more, the crawl could be split up further (e.g. district by district).
    house = []
    try:
        # Pages are numbered 1..100 inclusive.
        for page in range(1, 101):
            print('--------------------------------')
            print(f'开始爬取第{page}页')
            url = f'https://lz.esf.fang.com/house/i3{page}/'
            get_detail_url(url)
        print('爬取结束！')
    finally:
        # Persist whatever has been collected, even if the crawl is
        # interrupted or crashes part-way through.
        with open('urls.txt', 'a+', encoding='utf8') as f:
            f.writelines(link + '\n' for link in house)

房屋信息

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from UA import ua
import random
from lxml import etree
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import csv
from selenium import webdriver
from PIL import Image
# 云打码平台API
from vcode import *

# Requests in this script are sent with verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Shared request headers; the User-Agent value is filled in with a random
# entry from `ua` before requests are sent.
headers = {'User-Agent': ''}

def _parse_house_page(html):
    """Extract the listing fields from a parsed detail page.

    Args:
        html: The lxml element tree of a listing detail page.

    Returns:
        A ``(data, content)`` tuple of dicts in the shape ``to_csv`` expects:
        ``data`` holds the headline fields (price, layout, area, ...) and
        ``content`` holds the entries read from the description block.

    Raises:
        IndexError: If one of the expected nodes is missing from the page.
    """
    panel = '//div[@class="tab-cont-right"]'
    row1 = panel + '/div[@class="tr-line clearfix"][1]'
    row2 = panel + '/div[@class="tr-line clearfix"][2]'
    addr = panel + '/div[@class="tr-line"]'

    def first(path):
        # First match of an XPath query; IndexError when nothing matches.
        return html.xpath(path)[0]

    data = {'总价': first(panel + '/div[1]/div[1]/div[1]/i/text()'),
            '户型': first(row1 + '/div[1]/div[1]/text()').replace('\n', '').strip(),
            '建筑面积': first(row1 + '/div[2]/div[1]/text()'),
            '单价': first(row1 + '/div[3]/div[1]/text()'),
            '朝向': first(row2 + '/div[1]/div[1]/text()'),
            '楼层': first(row2 + '/div[2]/div[1]/text()'),
            '装修': first(row2 + '/div[3]/div[1]/text()'),
            '区域': first(addr + '/div[2]/div[2]/a[1]/text()').replace('\n', '').strip(),
            # 1 when the third child block of the address row exists, else 0.
            '学校': 1 if html.xpath(addr + '/div[3]') else 0}
    content = {'建筑年代': '',
               '有无电梯': '',
               '产权性质': '',
               '住宅类别': '',
               '建筑结构': '',
               '建筑类别': ''}
    info = html.xpath('//div[@class="content-item fydes-item"]/div[2]//span/text()')
    # The spans alternate label, value; the final two spans are left out,
    # exactly as in the original loop bound.
    for idx in range((len(info) - 2) // 2):
        content[info[2 * idx]] = info[2 * idx + 1]
    return data, content


def get_info(url):
    """Fetch one listing detail page and append its fields to the CSV file.

    The URL first answers with a redirect page whose ``btn-redir`` link points
    at the real detail page.  Any failure while fetching or parsing is treated
    as a captcha challenge and handed to ``process_captcha(url)``:

    * if the page title afterwards is the generic list-page title, the URL is
      recorded in the global ``delete`` list and skipped;
    * otherwise the URL is retried once; the global ``flag`` list remembers
      URLs that were already retried so the captcha service is not called in
      an endless loop.

    Args:
        url: Detail-page URL as stored in ``urls.txt``.

    Returns:
        None.  A successful parse is written out through ``to_csv``.
    """
    headers['User-Agent'] = random.choice(ua)
    try:
        r = requests.get(url, headers=headers, verify=False, timeout=60)
        html = etree.HTML(r.text)
        detail_url = html.xpath('//a[@class="btn-redir"]/@href')[0]
        r = requests.get(detail_url, headers=headers, verify=False, timeout=60)
        html = etree.HTML(r.text)
        data, content = _parse_house_page(html)
    except Exception:
        # `Exception` (not a bare except) so Ctrl+C stops the crawler instead
        # of triggering a captcha solve.
        title = process_captcha(url)
        # A generic list-page title means the URL is no longer valid,
        # presumably because the listing has been taken down.
        if title == '兰州二手房-房天下':
            delete.append(url)
            return
        # Retry each URL at most once, then move on to the next one.
        if url in flag:
            return
        flag.append(url)
        get_info(url)
    else:
        # Written outside the try block so that a file-system error is not
        # mistaken for a captcha challenge.
        to_csv(data, content)

def to_csv(data,content):
    """Append one listing as a single row of ``house.csv``.

    Column order: the five layout fields from ``data``, the six description
    fields from ``content``, then district, school flag, total price and unit
    price from ``data``.

    Args:
        data: Dict with the headline fields of the listing.
        content: Dict with the description-block fields of the listing.
    """
    head_keys = ('户型', '建筑面积', '朝向', '楼层', '装修')
    detail_keys = ('建筑年代', '有无电梯', '产权性质', '住宅类别', '建筑结构', '建筑类别')
    tail_keys = ('区域', '学校', '总价', '单价')
    with open('house.csv', mode='a+', encoding='utf-8', newline='') as out:
        row = [data[key] for key in head_keys]
        row += [content[key] for key in detail_keys]
        row += [data[key] for key in tail_keys]
        csv.writer(out).writerow(row)

def process_captcha(url):
    """Solve the captcha shown for ``url`` through the Chaojiying service.

    The page is opened in Firefox, a screenshot is taken and cropped to the
    fixed rectangle where the captcha image is expected, the crop is sent to
    the recognition API, and the returned text is typed into the ``code``
    field and submitted.  The URL is then loaded again and its title returned
    so the caller can tell what the site served afterwards.

    Account credentials are read from the ``CHAOJIYING_USER`` and
    ``CHAOJIYING_PASSWORD`` environment variables; when unset, the placeholder
    strings below are used and must be replaced with a real account.

    Args:
        url: The page that triggered the captcha.

    Returns:
        The browser title of ``url`` after the captcha was submitted.
    """
    # Local imports: only needed here. The locator-based `find_element` call
    # works on both Selenium 3 and Selenium 4.
    import os
    from selenium.webdriver.common.by import By

    driver = webdriver.Firefox()
    try:
        driver.get(url)
        print(url)
        driver.save_screenshot('code.png')
        # Pixel box (left, top, right, bottom) of the captcha image within
        # the screenshot.
        with Image.open('code.png') as shot:
            shot.crop((700, 340, 900, 405)).save('captcha.png')
        # Account, password and the software ID registered with the service.
        cjy = Chaojiying_Client(os.environ.get('CHAOJIYING_USER', '你的账号'),
                                os.environ.get('CHAOJIYING_PASSWORD', '你的密码'),
                                '902223')
        with open('captcha.png', 'rb') as fh:
            image_bytes = fh.read()
        # 1004 is the captcha type code passed to the recognition API.
        code = cjy.PostPic(image_bytes, 1004).get('pic_str')
        driver.find_element(By.ID, 'code').send_keys(code)
        time.sleep(1)
        driver.find_element(By.NAME, 'submit').click()
        time.sleep(2)
        driver.get(url)
        return driver.title
    finally:
        # Always end the browser session, even when a step above fails.
        driver.quit()

if __name__ == '__main__':

    # Load the detail-page URLs, dropping blank lines and duplicates.
    # dict.fromkeys keeps first-seen order, so positions are identical on
    # every run; the previous list(set(...)) order changed between runs,
    # which made resuming from a fixed index unreliable.
    with open('urls.txt', encoding='utf8') as f:
        stripped = (line.rstrip() for line in f)
        house = list(dict.fromkeys(link for link in stripped if link))
    # URLs found to be invalid / URLs already retried once (see get_info).
    delete = []
    flag = []
    # Index to resume from after an earlier, interrupted run; it refers to
    # the file-order position in the de-duplicated list. Use 0 to start over.
    start = 2852
    for i in range(start, len(house)):
        print(f'开始爬取第{i+1}条信息')
        get_info(house[i])
    print('爬取结束！')

结果展示

结果.png

共计8000多条数据，和下图对应。

总体情况.png
数据详情见：https://www.kesci.com/home/dataset/5f073e5ac94d2e002d03522d/files

二、数据分析

项目详情见：https://www.kesci.com/home/project/5f098536192ac2002c87c5aa

爬取房天下二手房数据和二手房房价分析及预测

第一部分爬虫

二、数据分析

每天进步一点点！

猜你喜欢

热点阅读

爬取房天下二手房数据和二手房房价分析及预测

第一部分 爬虫

二、数据分析

每天进步一点点！

猜你喜欢

热点阅读

第一部分爬虫