python爬虫python学习python开发

爬取京东商品信息

2017-02-06  本文已影响129人  _weber_
功能

从京东商城的商品列表页面解析出商品详情页链接，进而解析出商品名称、编号、店铺、品类等信息。

代码片段及说明
import requests
from bs4 import BeautifulSoup
import time
def get_detail_urls(list_url, delay=2, timeout=10):
    """Collect the product-detail links found on one product-list page.

    Args:
        list_url: URL of the product-list page to download.
        delay: Seconds to sleep before issuing the request (simple
            throttling between consecutive requests).
        timeout: Seconds to wait for the server before ``requests``
            gives up; without it a stalled connection blocks forever.

    Returns:
        A list of ``href`` strings taken from every ``div.jDesc > a``
        anchor, exactly as written in the page (they are not resolved
        into absolute URLs here). Anchors without a usable ``href`` are
        skipped, so the list never contains ``None``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    time.sleep(delay)
    res = requests.get(list_url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'lxml')
    anchors = soup.select('div.jDesc > a')
    # Tag.get() yields None for an anchor with no href attribute; drop
    # those (and empty hrefs) so callers can safely treat every entry as
    # a non-empty string.
    hrefs = (anchor.get('href') for anchor in anchors)
    return [href for href in hrefs if href]
def _parameter_items(soup, position):
    """Return the matches for the ``position``-th <li> of the parameter list."""
    selector = 'div.p-parameter > ul > li:nth-of-type({})'.format(position)
    return soup.select(selector)


def _value_after_colon(item):
    """Return the text that follows the last ':' in ``item``'s text."""
    # NOTE(review): if the live page separates label and value with a
    # full-width colon, this split leaves the label attached - confirm
    # against a real page.
    return item.get_text().split(':')[-1]


def get_detail(detail_url, delay=2, timeout=10):
    """Fetch a product-detail page and extract its basic parameters.

    The values are read from the ``div.p-parameter > ul`` list: items 1,
    2 and 3 are taken as product name, product number and shop, and
    item 5 as the category.

    Args:
        detail_url: Absolute URL of the product-detail page.
        delay: Seconds to sleep before issuing the request (simple
            throttling between consecutive requests).
        timeout: Seconds to wait for the server before ``requests``
            gives up; without it a stalled connection blocks forever.

    Returns:
        A dict with the string values ``sku_name``, ``sku_num``, ``shop``
        and ``cate``. ``cate`` is ``''`` when the page has no fifth
        parameter item, so every value is always a string.

    Raises:
        IndexError: If any of the first three parameter items is missing.
        requests.RequestException: If the request fails or times out.
    """
    time.sleep(delay)
    res = requests.get(detail_url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'lxml')

    # The first three items are required: indexing [0] raises IndexError
    # when the page does not contain them.
    sku_name = _value_after_colon(_parameter_items(soup, 1)[0])
    sku_num = _value_after_colon(_parameter_items(soup, 2)[0])
    shop = _value_after_colon(_parameter_items(soup, 3)[0])

    # The category item is optional; fall back to an empty string rather
    # than leaking the empty result list into the returned dict.
    cate_items = _parameter_items(soup, 5)
    cate = _value_after_colon(cate_items[0]) if cate_items else ''

    return {
        'sku_name': sku_name,
        'sku_num': sku_num,
        'shop': shop,
        'cate': cate,
    }
if __name__ == '__main__':
    # Imported here so this script section brings its own dependency.
    from urllib.parse import urljoin

    # Shop listing endpoint; '{}' is the 1-based page number.
    list_url_template = (
        'https://module-jshop.jd.com/module/getModuleHtml.html?appId=496014&orderBy=3&pageNo={}'
        '&direction=1&categoryId=0&pageSize=24&pagePrototypeId=8&pageInstanceId=27448388&moduleInstanceId=27448395'
        '&prototypeId=68&templateId=401682&layoutInstanceId=27448395&origin=0&shopId=162403&venderId=167694'
    )
    # Pages 1 through 99.
    list_urls = [list_url_template.format(page) for page in range(1, 100)]

    # First pass: gather every detail link from all list pages.
    detail_urls = []
    for list_url in list_urls:
        for href in get_detail_urls(list_url):
            if not href:
                # Anchor without a usable href; nothing to fetch.
                continue
            # Resolve against the page the link came from: protocol-relative
            # hrefs ('//host/path') get the 'https:' scheme, while hrefs that
            # are already absolute are left untouched.
            detail_urls.append(urljoin(list_url, href))

    # Second pass: fetch each detail page and print its record.
    for detail_url in detail_urls:
        data = get_detail(detail_url)
        data['detail_url'] = detail_url
        print(data)
输出结果
京东商品信息
上一篇下一篇

猜你喜欢

热点阅读