python - 对njhouse的房源信息抓包

2021-12-31  本文已影响0人  温柔vs先生
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@文件        :NJHouse.py
@说明        :https://www.njhouse.com.cn(对njhouse的房源信息抓包)
@时间        :2021/12/28 16:42:57
@作者        :wbb
@版本        :1.0
'''

from os import name
from typing import Mapping, NamedTuple
import requests
from bs4 import BeautifulSoup
import xlwt
from fake_useragent import UserAgent

# 传入图片链接,识别图片
from PIL import Image
import pytesseract
from io import BytesIO


def main():
    """Scrape the building's sales data and export it to an Excel workbook."""
    saveDataToExcel(getData())


# 数据保存到Excel


def _centered_alignment():
    """Return a new xlwt alignment centred both horizontally and vertically."""
    alignment = xlwt.Alignment()
    alignment.horz = xlwt.Alignment.HORZ_CENTER
    alignment.vert = xlwt.Alignment.VERT_CENTER
    return alignment


def saveDataToExcel(datasourceDic):
    """Write the scraped building data to ``NJHouse.xls``.

    Sheet layout:
      * row 0 - building header, bold and centred, merged across the
        summary-table columns;
      * row 1 - summary-table column titles;
      * row 2 - summary-table values (centred);
      * row 4 onwards - one row per floor: the floor label in column 0,
        followed by one cell per room with its number, area and price.

    Args:
        datasourceDic: dict shaped like the return value of ``getData()``.
            Keys read here: ``sheetTitle``, ``detail_dong`` (with
            ``dong_header``, ``dong_table_th_title_list`` and
            ``dong_table_td_title_list``) and ``dataList`` (a list of
            ``{'ceng': ..., 'roomList': [...]}`` dicts whose rooms carry
            ``room``, ``area`` and ``price``).

    Raises:
        KeyError: if one of the keys listed above is missing.
        IndexError: if there are fewer summary values than summary titles.
    """
    # CSS class of a room cell -> hex colour meant to become its background.
    # TODO: give each room cell its background colour (needs xlwt custom
    # palette colours); until then this table is kept for reference only.
    color_dic = {'ks': '#99cc00', 'rg': '#ffff66',
                 'qy': '#ff9900', 'ba': '#ff0019', 'az': '#ffccff'}

    sheetTitle = datasourceDic['sheetTitle'].replace('[', '').replace(']', '')
    dataList = datasourceDic['dataList']
    detail_dong = datasourceDic['detail_dong']
    dong_header = detail_dong['dong_header']
    th_titles = detail_dong['dong_table_th_title_list']
    td_titles = detail_dong['dong_table_td_title_list']

    workbook = xlwt.Workbook(encoding='utf-8')
    bookSheet = workbook.add_sheet(sheetTitle, cell_overwrite_ok=True)

    # Styles are built once instead of once per column.
    header_font = xlwt.Font()
    header_font.bold = True
    header_style = xlwt.XFStyle()
    header_style.alignment = _centered_alignment()
    header_style.font = header_font

    value_style = xlwt.XFStyle()
    value_style.alignment = _centered_alignment()

    # Row 0: header merged over the summary columns. The last column is
    # clamped to 0 so an empty title list does not produce an invalid range.
    last_col = max(len(th_titles) - 1, 0)
    bookSheet.write_merge(0, 0, 0, last_col,
                          label=dong_header, style=header_style)

    # Rows 1 and 2: summary titles and their values.
    for col, title in enumerate(th_titles):
        bookSheet.write(1, col, label=title)
        bookSheet.write(2, col, label=td_titles[col], style=value_style)

    # Row 3 is left blank; floors start on row 4.
    first_floor_row = 4
    for row, cengDic in enumerate(dataList, start=first_floor_row):
        # Column 0 holds the floor label.
        bookSheet.write(row, 0, label=cengDic['ceng'])

        for col, roomDic in enumerate(cengDic["roomList"], start=1):
            room = roomDic['room']
            area = roomDic['area']
            price = roomDic['price']
            bookSheet.write(
                row, col, label=f'{room}\n面积:{area}\n价格:{price}')

    # Save once, after every row is written. Previously this ran inside the
    # floor loop, so the file was rewritten per floor and was never created
    # when dataList was empty.
    workbook.save('NJHouse.xls')


# 爬取网页
def getData():
    """Download one building's sales page and collect its data into a dict.

    Returns:
        dict with the keys:
          * ``sheetTitle`` - text of the page's title ``<h2>``;
          * ``detail_dong`` - dict with ``dong_header`` (header text) and
            ``dong_table_th_title_list`` / ``dong_table_td_title_list``
            (cell texts of the summary table's first head and body rows);
          * ``room_info_list`` - list of ``{'type', 'title'}`` dicts, one per
            entry of the colour legend;
          * ``dataList`` - list of ``{'ceng', 'roomList'}`` dicts, one per row
            of the sales table; every room is a dict with ``type``, ``room``,
            ``area`` and ``price``.
    """
    # Page of one specific building
    page_url = 'https://www.njhouse.com.cn/spf/sales_detail?PRJ_ID=2867150&prjid=2867150&buildid=580130&dm=9幢'
    img_base_url = 'https://www.njhouse.com.cn/'

    soup = BeautifulSoup(requestUrl(page_url).text, "html.parser")

    result = {}

    # Sheet name
    result['sheetTitle'] = soup.select(
        'body > div.main > div.business_centers > div > div.spf_del_title.clearfix > h2')[0].text

    # Details of the building
    header_tag = soup.select(
        'body > div.main > div.business_centers > div > div.spf_del_block > div > div > div:nth-child(1) > h2')[0]
    # Drop the child <font> element so its text is not part of the header
    header_tag.font.decompose()
    summary = {'dong_header': header_tag.text.strip()}

    summary_table = soup.select(
        'body > div.main > div.business_centers > div > div.spf_del_block > div > div > table:nth-child(2)')[0]
    summary["dong_table_th_title_list"] = [
        th.text for th in summary_table.thead.tr.find_all('th')]
    summary["dong_table_td_title_list"] = [
        td.text for td in summary_table.tbody.tr.find_all('td')]
    result['detail_dong'] = summary

    # Colour legend: first CSS class of the <span> plus the caption in <p>
    legend_items = soup.select(
        'body > div.main > div.business_centers > div > div.spf_del_block > div > div > div.color_nav > ul > li')
    result['room_info_list'] = [
        {'type': entry.span.get('class')[0], 'title': entry.p.text}
        for entry in legend_items
    ]

    def ocr_text(img_tag):
        # Recognise the text of an <img> whose src is relative to the site root
        recognised = textForImgUrl(img_base_url + img_tag.get('src'))
        return recognised.replace("\n", "").strip()

    # Sales table
    sales_table = soup.find("table", class_='ck_table')
    floors = []
    for floor_row in sales_table.tbody.find_all('tr'):
        floor = {'ceng': floor_row.find("td", class_="td_h").text.strip()}

        # Every child of the row except the first one is treated as a room
        cells = list(floor_row.children)
        del cells[0]

        rooms = []
        for position, cell in enumerate(cells):
            room_entry = {'type': cell.get("class")[0]}
            links = cell.find_all('a')

            # Room number
            room_number = links[0].text
            print('爬取的房间号', room_number)
            room_entry['room'] = room_number

            detail_link = links[1]
            if position != 0:
                # Area and price are read from the link text: the price is
                # what follows the last "价格:", the area is what sits between
                # the last ":" before the first "价格:" and that marker
                raw_text = detail_link.get_text()
                before_price = raw_text.partition("价格:")[0]
                room_entry['area'] = before_price.rpartition(":")[2]
                room_entry['price'] = raw_text.rpartition("价格:")[2]
            else:
                # For the first room of a row both values are read from
                # images via OCR: first image = area, second image = price
                images = detail_link.find_all("img")
                room_entry['area'] = ocr_text(images[0])
                room_entry['price'] = ocr_text(images[1])

            rooms.append(room_entry)

        floor["roomList"] = rooms
        floors.append(floor)

    result["dataList"] = floors
    return result

# 图片链接转文本
# Tesseract OCR识别


def textForImgUrl(imgUrl):
    """Download the image at *imgUrl* and return the text Tesseract reads in it."""
    raw_bytes = requestUrl(imgUrl).content
    picture = Image.open(BytesIO(raw_bytes))
    # Double both dimensions before handing the image to the OCR engine
    enlarged = picture.resize(tuple(side * 2 for side in picture.size))
    return pytesseract.image_to_string(enlarged)

# 保存图片到本地


def saveImage(imgUrl):
    """Download the image at *imgUrl* and save it in the working directory.

    The file name is the value of the first query parameter of the URL, e.g.
    ``.../common/imgmake?num=172355&bg=%2399CC00`` is stored as ``172355``.

    Args:
        imgUrl: absolute URL of the image.
    """
    first_param = imgUrl.split("?")[-1].split("&")[0]
    imgPath = first_param.split("=")[-1]

    # Fetch first, then open the file: if the request raises, no empty file
    # is left behind.
    img = requestUrl(imgUrl).content

    with open(imgPath, "wb") as f:
        f.write(img)


# 进行网络请求
def requestUrl(url, timeout=10):
    """Send a GET request with a desktop-browser User-Agent.

    Args:
        url: address to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response``, with its text encoding forced to UTF-8.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36"}

    response = requests.get(url, headers=header, timeout=timeout)
    # Decode the body as UTF-8 regardless of what the response headers declare
    response.encoding = 'utf-8'
    return response


# Run the scrape-and-export pipeline only when executed as a script.
if __name__ == '__main__':
    main()
    # Manual check of the OCR helper on a single image (disabled):
    # imgUrl = 'https://www.njhouse.com.cn/common/imgmake?num=172355&bg=%2399CC00'
    # textForImgUrl(imgUrl)


上一篇下一篇

猜你喜欢

热点阅读