第一周第三课时
2016-05-21 作者:采矿(本文已影响 21 人)
本课内容:从列表页抓取详情页链接,再解析每个详情页的详细信息。
from bs4 import BeautifulSoup
import requests
import time
# Listing pages 1..13 of the short-term rental search results.
sourceurls = [f'http://bj.xiaozhu.com/search-duanzufang-p{page}-0/' for page in range(1, 14)]
detail_urls = []  # detail-page links harvested from the listing pages
detailku = []     # scraped record dicts, one per detail page


def get_detail_url(urls):
    """Collect detail-page links from one search-result listing page.

    Fetches the listing page at ``urls`` (a single URL, despite the plural
    name — kept for backward compatibility), parses it, and appends every
    detail-page ``href`` found in the result list to the module-level
    ``detail_urls`` list. Prints the accumulated list for progress tracking.
    """
    # timeout keeps one dead server from hanging the entire crawl forever
    web_data = requests.get(urls, timeout=10)
    # Fail loudly on HTTP errors (404/500/...) instead of silently
    # parsing an error page and collecting nothing.
    web_data.raise_for_status()
    time.sleep(2)  # be polite: throttle requests to the site
    soup = BeautifulSoup(web_data.text, 'lxml')
    for link in soup.select('#page_list > ul > li > a'):
        detail_urls.append(link.get('href'))
    print(detail_urls, len(detail_urls))
# sourceurls is a list of listing pages, so feed them to the scraper one at a time.
for page_url in sourceurls:
    get_detail_url(page_url)


def get_dtail_info(url):
    """Scrape one rental detail page and append a record dict to ``detailku``.

    Extracts the listing title, neighbourhood, daily price, main photo,
    landlord avatar, landlord gender and landlord name, then stores them
    as one dict in the module-level ``detailku`` list and prints it.
    (Function name keeps the original 'dtail' typo so existing callers work.)
    """
    # timeout keeps one dead server from hanging the entire crawl forever
    web_data = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    web_data.raise_for_status()
    time.sleep(1)  # be polite: throttle requests to the site
    soup = BeautifulSoup(web_data.text, 'lxml')
    titles = soup.select('h4 > em')
    areas = soup.select('span.pr5')
    day_prices = soup.select('div.day_l > span')
    house_pics = soup.select('#curBigImage')
    landlord_pics = soup.select('div.member_pic > a > img')
    landlord_names = soup.select('a.lorder_name')
    # The member_ico class marks a male host; it applies to the whole page,
    # so compute the gender once as a scalar. The original zipped over the
    # one-character string '男'/'女', which only worked by accident because
    # both strings have length 1.
    landlord_gender = '男' if soup.find_all('div', 'member_ico') else '女'
    for title, area, day_price, house_pic, landlord_pic, landlord_name in zip(
            titles, areas, day_prices, house_pics, landlord_pics, landlord_names):
        data = {
            'title': title.get_text(),
            'area': area.get_text(),
            'day_price': day_price.get_text(),
            'house_pic': house_pic.get('src'),
            'landlord_pic': landlord_pic.get('src'),
            'landlord_gender': landlord_gender,
            'landlord_name': landlord_name.get_text()
        }
        detailku.append(data)
        print(data, len(detailku))
# Scrape every detail page collected from the listing pages.
for detail_url in detail_urls:
    get_dtail_info(detail_url)
(全文完)