1_3抓取租房信息_笔记

2016-11-26  本文已影响0人  蜂DAO

最终效果:

最终效果.png

我的代码:

from bs4 import BeautifulSoup
import requests
import time

urls = []
# Collect the detail-page links found on one listing page and append
# them to the module-level `urls` list.
# data: URL of a listing page
def conUrlFun(data):
    response = requests.get(data)
    soup = BeautifulSoup(response.text, 'lxml')
    for anchor in soup.select('.pic_list > li > a.resule_img_a'):
        # Each anchor's href points at an individual rental's detail page
        urls.append(anchor.get('href'))

# Crawl the detail-page links out of a range of listing pages.
# data1, data2: first listing-page number (inclusive) to last (exclusive),
#               i.e. the same semantics as range(data1, data2).
def urlFun(data1, data2):
    # Use a distinct local name so we do not shadow the module-level
    # `urls` list that conUrlFun() appends results to.
    list_page_urls = [
        "http://bj.xiaozhu.com/search-duanzufang-p{}-0/".format(i)
        for i in range(data1, data2)
    ]
    for url in list_page_urls:
        print(url)
        # Harvest the detail-page links from this listing page
        conUrlFun(url)
        # Bug fix: throttle between requests. The original slept once
        # AFTER the loop, which rate-limited nothing.
        time.sleep(0.5)

# Fetch one rental detail page and print its title, address, price,
# photo, host nickname, host gender and host avatar.
# data: URL of a detail page
def conFun(data):
    wb_data = requests.get(data)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('.pho_info > h4 > em')[0].get_text()
    addr = soup.select('.pho_info > p > span')[0].get_text().strip()
    price = soup.select('.day_l > span')[0].get_text()
    image = soup.select('.pho_show_big > div > img')[0].get('src')
    avartar = soup.select('.member_pic > a > img')[0].get('src')
    name = soup.select('.lorder_name')[0].get_text()
    # Gender is encoded in the CSS class of the badge next to the avatar:
    # 'member_ico' = male, 'member_ico1' = female, anything else = unknown.
    sexs = soup.select('.member_pic > div')[0]['class'][0]
    # Idiom fix: dict lookup replaces the if/elif chain and the
    # redundant str(...) wrappers around literal strings.
    sex = {'member_ico': '男', 'member_ico1': '女'}.get(sexs, '不明')

    datas = {
        "title": title,
        "addr": addr,
        "price": price,
        "image": image,
        "avartar": avartar,
        "name": name,
        "sex": sex
    }
    print(datas)

# Crawl listing pages 1-2, then fetch and print every collected detail page.
urlFun(1, 3)
for detail_url in urls:
    conFun(detail_url)
    # Be polite: pause between detail-page requests
    time.sleep(1)

学到的知识:

上一篇 下一篇

猜你喜欢

热点阅读