淘宝爬虫爬取商品详情和销量
2019-02-18 本文已影响0人
探索者_逗你玩儿
废话不多说,直接上代码。由于获取销量的接口需要登录后的 cookies,并且需要具备相应的访问权限,所以需要先在浏览器(web)上登录一次,然后再通过代码获取销量字段。
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
from pycookiecheat import chrome_cookies
from selenium import webdriver
from selenium.webdriver.common.by import By

from module.CookiesUtil import get_cookie_path
from module.TaobaoItem import TaobaoItem
class TaobaoProcessor(object):
    """Scrapers for a Taobao item page: details, price and sales figures.

    ``process`` drives a Chrome browser through Selenium, ``get_price`` and
    ``taobao_spider`` query the ``detailskip.taobao.com`` ``sib.htm``
    endpoint, and ``process_html`` dumps the fetched page.  Every public
    method prints its result and returns ``None``.
    """

    @staticmethod
    def _strip_size_suffix(img_url):
        """Return *img_url* without its last ``_``-separated segment.

        NOTE(review): thumbnail URLs presumably end in a size suffix such as
        ``_50x50.jpg`` -- confirm.  A URL that contains no underscore is
        returned unchanged instead of being reduced to an empty string.
        """
        return img_url.rsplit("_", 1)[0]

    @staticmethod
    def _build_cookie_header(cookies):
        """Serialise a ``{name: value}`` mapping as ``name=value;`` pairs."""
        return "".join("{}={};".format(name, value) for name, value in cookies.items())

    @staticmethod
    def _pick_real_price(prices):
        """Choose the price to report from the raw ``"price"`` values.

        The values may mix fixed prices and ranges (``"89.00-129.00"``) and
        are not guaranteed to be ordered.  The first range (in input order)
        wins; otherwise the lowest fixed price is returned as a ``float``.
        Returns ``None`` when *prices* is empty.
        """
        for price in prices:
            if '-' in price:
                return price
        if not prices:
            return None
        return min(map(float, prices))

    @staticmethod
    def _parse_promo_price(jsonp_text):
        """Extract the default promotional price from a ``sib.htm`` reply.

        *jsonp_text* is expected to be JSON wrapped in
        ``onSibRequestSuccess(...)``.  The value under
        ``data.promotion.promoData.def`` is read either as a list of promotion
        dicts (the first entry is used) or as a single dict.  Returns the
        ``price`` string, or ``None`` when the reply is not JSONP, is not
        valid JSON, or carries no default promotion.
        """
        found = re.search(r"onSibRequestSuccess\((.+)\)", jsonp_text)
        if found is None:
            return None
        try:
            node = json.loads(found.group(1))
        except ValueError:
            return None
        for key in ('data', 'promotion', 'promoData', 'def'):
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        if isinstance(node, list):
            node = node[0] if node else None
        if isinstance(node, dict):
            return node.get('price')
        return None

    def process(self, url):
        """Open *url* in Chrome via Selenium and print the scraped fields.

        The values are stored on a ``TaobaoItem`` whose ``__dict__`` is
        printed.  The promotional price is not collected here.  The browser
        is always closed, even when an element lookup fails.
        """
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            browser.implicitly_wait(10)
            title = browser.find_element(By.CLASS_NAME, 'tb-main-title').text
            price_box = browser.find_element(By.ID, 'J_StrPrice')
            origin_price = price_box.find_element(By.CLASS_NAME, 'tb-rmb-num').text
            thumbs = browser.find_elements(By.XPATH, '//ul[@id="J_UlThumb"]/li/div/a/img')
            img_list = [self._strip_size_suffix(thumb.get_attribute('src')) for thumb in thumbs]
            # First SKU property: its label and one option per text line.
            chima = browser.find_element(By.XPATH, '//*[@id="J_isku"]/div/dl[1]/dt').text
            chima_element = browser.find_element(By.XPATH, '//*[@id="J_isku"]/div/dl[1]/dd/ul').text
            span_list = chima_element.split("\n")
            # Second SKU property: its label and the inner HTML of each option.
            color_pro = browser.find_element(By.XPATH, '//*[@id="J_isku"]/div/dl[2]/dt').text
            color_val = browser.find_elements(By.XPATH, '//*[@id="J_isku"]/div/dl[2]/dd/ul/li/a/span')
            color_list = [option.get_attribute('innerHTML') for option in color_val]
            attr_val = browser.find_element(By.ID, "attributes").get_attribute("innerHTML")
            description_val = browser.find_element(By.ID, "description").get_attribute("innerHTML")
            sale_count = browser.find_element(By.ID, "J_SellCounter").get_attribute('innerHTML')
            comment_count = browser.find_element(By.ID, "J_RateCounter").text
        finally:
            # Release the Chrome/chromedriver processes in every case.
            browser.quit()

        item = TaobaoItem()
        item.title = title
        item.origin_price = origin_price
        item.img_list = img_list
        item.chima = chima
        item.color_pro = color_pro
        item.span_list = span_list
        item.color_list = color_list
        item.attr_val = attr_val
        item.description_val = description_val
        item.sale_count = sale_count
        item.comment_count = comment_count
        print(item.__dict__)

    def get_price(self, userid, itemid):
        """Print the default promotional price of item *itemid*.

        *userid* is sent as the ``sellerId`` query parameter.  Prints ``None``
        when the reply carries no default promotion.

        NOTE(review): no cookies are sent with this request; the endpoint may
        need a logged-in session -- confirm.
        """
        url = (
            "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm"
            "?itemId=" + str(itemid) +
            "&sellerId=" + str(userid) +
            "&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,"
            "activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,"
            "originalPrice,tradeContract"
            "&callback=onSibRequestSuccess"
        )
        _headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'authority': 'detailskip.taobao.com',
            'method': 'GET',
        }
        with requests.session() as session:
            resp = session.get(url, headers=_headers)
        new_price = self._parse_promo_price(resp.content.decode())
        print(new_price)

    def process_html(self, url):
        """Fetch *url* with ``requests`` and print the prettified HTML."""
        _headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        with requests.session() as session:
            resp = session.get(url, headers=_headers)
        soap = BeautifulSoup(resp.content)
        print(soap.prettify())

    def taobao_spider(self, url):
        """Scrape an item page with regexes and print details, price and sales.

        The static page supplies the title, list price, seller id, attribute
        block and description URL.  The real price and the sales counters are
        loaded dynamically, so they are requested from ``sib.htm`` using the
        cookies that ``chrome_cookies`` reads for ``https://item.taobao.com/``.

        If the page itself cannot be fetched, a message is printed and the
        method returns.  If a later extraction step fails, the reason is
        printed and whatever was collected so far is still reported (missing
        fields print as ``None``).

        Raises:
            IndexError: if *url* has no ``id=<digits>`` query parameter.
        """
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.3',
            'Referer': 'https://item.taobao.com/item.htm',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Connection': 'keep-alive',
        }
        # Anchor on '?'/'&' so parameters such as 'userid=' are not matched.
        goods_id = re.findall(r'[?&]id=(\d+)', url)[0]

        try:
            page_request = urllib.request.Request(url=url, headers=headers)
            with urllib.request.urlopen(page_request) as page:
                res = page.read().decode('gbk', 'ignore')
        except Exception as e:
            # Only URLError-style exceptions carry ``reason``; fall back to
            # the exception itself for everything else.
            print('无法打开网页:', getattr(e, 'reason', e))
            return

        title = line_price = real_price = None
        confirm_sell = sell_total = None
        attributes_html = des_val = None
        try:
            titles = re.findall('<h3 class="tb-main-title" data-title="(.*?)"', res)
            title = titles[0] if titles else None
            line_price = re.findall('<em class="tb-rmb-num">(.*?)</em>', res)[0]
            des_url = re.findall(r"descUrl\s+:(.+)", res)
            sellerid = re.findall(r"sellerId\s+:\s\'(.+)\'", res)[0]
            # The third ':'-separated piece of the descUrl expression is used;
            # the quoted, scheme-less URL is then pulled out of it.
            des_url = des_url[0].split(":")[2]
            des_url = 'https:' + re.findall(r"\'(.+)\'", des_url)[0]

            des_request = urllib.request.Request(url=des_url, headers=headers)
            with urllib.request.urlopen(des_request) as des_resp:
                des_text = des_resp.read().decode('gbk', 'ignore')
            # The description arrives as a JS assignment; strip its quoting.
            des_val = re.findall(r"var desc=((.+\s)+)", des_text)[0][0]
            des_val = des_val.replace("';", '')
            des_val = des_val.replace("'", '')
            des_val = des_val.replace("\\", '')

            soap = BeautifulSoup(res)
            attributes_html = str(soap.find_all('div', id='attributes')[0])

            # The real price and sales counters are loaded dynamically; only
            # the price and soldQuantity modules are requested.
            purl = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId={}&sellerId={}&modules=price,soldQuantity".format(
                goods_id, sellerid)
            headers['Referer'] = url
            cookies = chrome_cookies('https://item.taobao.com/', get_cookie_path())
            headers['Cookie'] = self._build_cookie_header(cookies)
            price_req = urllib.request.Request(url=purl, headers=headers)
            with urllib.request.urlopen(price_req) as price_res:
                resp_data = price_res.read().decode()

            real_price = self._pick_real_price(re.findall('"price":"(.*?)"', resp_data))
            confirm_sell = re.findall(r'"confirmGoodsCount":"(.*?)"', resp_data)[0]
            sell_total = re.findall(r'"soldTotalCount":"(.*?)"', resp_data)[0]
        except Exception as e:
            # Best effort: report why extraction stopped, then fall through
            # and print the fields gathered before the failure.
            print('数据抽取失败!!!', e)

        print('商品名:', title)
        print('划线价格:', line_price)
        print('真实价格:', real_price)
        print('商品链接:', url)
        print('确认订单:', confirm_sell)
        print('30天内销售订单:', sell_total)
        print("描述:", attributes_html, des_val)
if __name__ == '__main__':
    # Manual smoke test against a live item page.
    #
    # For reference, a captured sib.htm reply is JSON wrapped in
    # onSibRequestSuccess(...); the sales counters appear in it as
    #   "soldQuantity":{"confirmGoodsCount":"1452","soldTotalCount":"8863"}
    # and the default promotion as
    #   "promoData":{"def":[{..., "price":"96.00", ...}], ...}
    taobao = TaobaoProcessor()
    url = 'https://item.taobao.com/item.htm?spm=2013.1.w16867253-18554788179.1.7aa41c3dXWLKMm&id=556805373975'
    # Other entry points that can be tried by hand:
    #   taobao.process(url)
    #   taobao.get_price(2448721589, 573379814923)
    #   taobao.process_html(url)
    taobao.taobao_spider(url)