Python实战 - 第5节:开始使用MongoDB

2016-11-08  本文已影响0人  辉叔不太萌

笔记

作业

import pymongo
from bs4 import BeautifulSoup
import requests
import time

# Connect to the MongoDB server on localhost:27017 and select the database
# and collection that the scraped listings are written to.
client = pymongo.MongoClient('localhost', 27017)
xiaozhu = client['xiaozhu']
duanzufang = xiaozhu['duanzufang']

# Search-result (list) pages to crawl: page numbers 1 through 3.
urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 4)]


def parse_gender(gender_class):
    """Translate a gender icon CSS class name into a display label.

    Args:
        gender_class: Class name to look up.

    Returns:
        '女' for 'member_ico1', '男' for 'member_ico', and '未知' for
        anything else.
    """
    known_classes = (
        ('member_ico1', '女'),
        ('member_ico', '男'),
    )
    # Compare with == rather than a dict lookup so that any value, including
    # unhashable ones, simply falls through to the default label.
    for css_class, label in known_classes:
        if gender_class == css_class:
            return label
    return '未知'


def parse_datail_page(url):
    """Scrape one listing detail page and store the record in MongoDB.

    Fetches ``url``, extracts the listing fields with CSS selectors, prints
    the resulting record, inserts it into the ``duanzufang`` collection and
    then sleeps for one second before returning.

    Args:
        url: Address of the detail page to fetch.

    Raises:
        IndexError: If one of the selectors matches nothing on the page.
        ValueError: If the price text cannot be converted to ``int``.
    """
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')

    titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    addrs = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    rants = soup.select('#pricePart > div.day_l > span')
    pics = soup.select('#curBigImage')
    owner_pics = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    # The first CSS class of the icon <div> encodes the owner's gender.
    gender_class = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')[0]['class'][0]
    owner_names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')

    # Only the first match of each selector is used. The key 'rant' is kept
    # as spelled because stored documents are queried by that key.
    data = {
        'title': titles[0].get_text(),
        'addr': addrs[0].get_text().strip(),
        'rant': int(rants[0].get_text()),
        'pic': pics[0].get('src'),
        'owner_pic': owner_pics[0].get('src'),
        'owner_name': owner_names[0].get_text(),
        'gender': parse_gender(gender_class),
    }

    print(data)
    # Insert the record into MongoDB.
    duanzufang.insert_one(data)
    # Pause between detail-page requests.
    time.sleep(1)


def parse_list_page(url):
    """Fetch one search-result page and process every listing it links to.

    Args:
        url: Address of the list page to fetch.
    """
    response = requests.get(url)
    page = BeautifulSoup(response.text, 'lxml')

    # Hand the href of every matched anchor to the detail-page parser.
    for anchor in page.select('#page_list > ul > li > a'):
        parse_datail_page(anchor.get('href'))


# Crawl each list page in turn; parse_list_page also handles the detail
# pages linked from it.
for url in urls:
    parse_list_page(url)

import pymongo

# Open the 'duanzufang' collection of the 'xiaozhu' database on the
# MongoDB server at localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
xiaozhu = client.xiaozhu
duanzufang = xiaozhu.duanzufang

# Print every stored document whose 'rant' value is at least 500.
for item in duanzufang.find({'rant': {'$gte': 500}}):
    print(item)

上一篇 | 下一篇

猜你喜欢

热点阅读