Python 爬虫

Python实战作业 第一周:爬取58同城商品页详情

2017-05-24  本文已影响8人  浮生只言片语

任务:

1、获取网址:http://sh.58.com/pbdn 中商品详情链接地址
2、在商品详情页中获取:
类别----category
标题----title
价格----price
区域----area
浏览量--lookTime

成果:

Snip20170524_2.png

代码:


from bs4 import BeautifulSoup
import requests

url = 'http://sh.58.com/pbdn'

def GetGoodsUrls(url):
    """Fetch the 58.com listing page and return zhuanzhuan item detail-page URLs.

    Parameters
    ----------
    url : str
        Listing page URL, e.g. 'http://sh.58.com/pbdn'.

    Returns
    -------
    list[str]
        Detail-page URLs with any query string stripped, in page order.
    """
    # Timeout so a stalled server cannot hang the crawler indefinitely.
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    goodsUrls = soup.select('#infolist > div > table > tbody > tr > td.t > a')
    data = []
    for goodsUrl in goodsUrls:
        href = goodsUrl.get('href')
        if not href:
            # <a> without an href attribute — nothing to collect.
            continue
        parts = href.split('/')
        # Keep only zhuanzhuan (second-hand trading platform) detail links.
        # len() guard prevents IndexError on relative hrefs with no netloc.
        if len(parts) > 2 and parts[2] == 'zhuanzhuan.58.com':
            # Bug fix: the original used data.insert(-1, ...), which places
            # the first-seen link last and scrambles order; append keeps
            # the links in page order.
            data.append(href.split('?')[0])
    print(data)
    return data

def GetGoodsInfo(url):
    """Fetch one 58.com item detail page and print its extracted fields.

    Extracts category, title, price, area and view-count (lookTime) using
    CSS selectors for the zhuanzhuan detail-page layout, then prints the
    resulting dict.

    Parameters
    ----------
    url : str
        Detail-page URL as returned by GetGoodsUrls.
    """
    wb_data = requests.get(url, timeout=10)  # timeout: don't hang on a dead page
    soup = BeautifulSoup(wb_data.text, 'lxml')

    categorys = soup.select('#nav > div > span > a')
    titles = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
    prices = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
    areas = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
    lookTimes = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')
    # Drop the first breadcrumb link (the site-root entry), keeping the
    # actual category link(s).
    del categorys[0:1]
    # Bug fix: initialize `data` so print() below cannot raise NameError
    # when a selector matches nothing and the zip produces no rows.
    data = {}
    for category, title, price, area, lookTime in zip(categorys, titles, prices, areas, lookTimes):
        data = {
            'category': category.get_text(),
            'title': title.get_text(),
            # Bug fix: key was 'prices'; the task spec (and the other keys)
            # use the singular 'price'.
            'price': price.get_text(),
            'area': area.get_text(),
            'lookTime': lookTime.get_text()
        }
    print(data)

# Walk every detail link harvested from the listing page and print its info.
for detail_url in GetGoodsUrls(url):
    GetGoodsInfo(detail_url)
上一篇 下一篇

猜你喜欢

热点阅读