Python四期爬虫作业

python爬取优惠信息

2017-09-05  本文已影响114人  困困harper

大部分银行的官网上都会公布促销信息,有些银行做的很规范,有些做的非常混乱,今天通过python爬取一家相对比较规范的银行促销信息,如下图:

当我看到这个促销信息,我内心是高兴的,特别的规整。

我通过chrome浏览器可以查看此页面源码,通过检测发现所有数据都是由ajax获取json来动态加载,这样就省去了遍历页面的麻烦,可以直接解析json数据。

首先我们通过chrome检测功能中的network来拦截消息,如下图

我发现(https://creditcard.cmbc.com.cn/fe/getType.gsp?city=%E5%8C%97%E4%BA%AC%E5%B8%82),这个请求的response会返回json,我通过这个请求获取商区信息。然后通过请求(https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp?city=1&dictrict=&latitude=&longitude=&mark=pc&page=1&resolving=&rows=16&typeName=)其中包含了城市ID、商区ID、页码、行数,这里我指定每页行数为10,通过这个请求会拿到当前页的所有促销信息和总行数。我根据总行数和每页10行,计算出需要翻页次数。最后进入促销信息介绍页面,如下图:

同样json格式获取信息:

整体代码如下:

# 导入包
import os
import requests
import json
from lxml import etree
from multiprocessing import Pool

def getHtml(url):
    """Fetch *url* and return the decoded response body as text.

    Returns the sentinel string 'req_error' when the request fails or the
    HTTP status is not 200 — every caller in this script compares against
    that sentinel before parsing the body as JSON.
    """
    try:
        # A timeout keeps the crawler from hanging forever on a stalled
        # connection; the original call had none.
        req = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network-level failures (DNS, refused connection, timeout) are
        # reported the same way as bad status codes.
        return 'req_error'
    if req.status_code != 200:
        return 'req_error'
    # The endpoint serves UTF-8 JSON; set the encoding explicitly so that
    # req.text decodes correctly regardless of the response headers.
    req.encoding = 'utf-8'
    return req.text


def get_fenye(url):
    """Crawl the bank's promotion listings starting from the city-list URL.

    Flow: city list -> business areas per city -> paged shop listings per
    area (10 rows per page) -> per-shop detail, which is written to disk
    via writeTxt().  Any request that returns the 'req_error' sentinel is
    skipped so one bad endpoint does not abort the whole crawl.
    """
    html = getHtml(url)
    if html == 'req_error':
        return
    for city in json.loads(html.strip()):
        # city looks like {'tkey': <city id>, 'tvalue': <city name>, 'py': <initial>}
        _crawl_city(city)


def _crawl_city(city):
    """Fetch one city's business-area list and crawl each area."""
    area_url = 'https://creditcard.cmbc.com.cn/fe/getType.gsp?city=%s' % (city['tvalue'])
    html = getHtml(area_url)
    if html == 'req_error':
        return
    areas = json.loads(html.strip())
    # The endpoint sometimes answers with a non-list payload; ignore those.
    if not isinstance(areas, list):
        return
    for area in areas[0]['cityList'][0]['businessAreaList']:
        _crawl_area(city, area)


# Shared listing-URL template: city id, business-area id, page number.
_LIST_URL = ('https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp'
             '?city=%s&dictrict=%s&kindName=&latitude=&longitude=&mark=pc'
             '&page=%d&resolving=&rows=10&typeName=')


def _crawl_area(city, area):
    """Page through one business area's shop listing and crawl every shop."""
    html = getHtml(_LIST_URL % (city['tkey'], area['businessAreaId'], 1))
    if html == 'req_error':
        return
    # rowCount is the total number of promotions in this area.
    row_cnt = json.loads(html.strip())[0]['rowCount']
    if row_cnt <= 0:
        return
    # Ceil-divide by the page size. The original bound, int(row_cnt/10)+2,
    # requested one extra (empty) page whenever row_cnt % 10 == 0.
    pages = (row_cnt + 9) // 10
    for page in range(1, pages + 1):
        page_url = _LIST_URL % (city['tkey'], area['businessAreaId'], page)
        print(page_url)
        page_html = getHtml(page_url)
        if page_html == 'req_error':
            continue
        for shop in json.loads(page_html.strip())[0]['ShopList']:
            _crawl_shop(city, area, shop)


def _crawl_shop(city, area, shop):
    """Fetch one shop's detail record and persist its promotion info."""
    shop_url = ('https://creditcard.cmbc.com.cn/fe/find/fingShopByShopName.gsp'
                '?mark =pc&resolving=&rowser=&rowserEdition=&shopId=%s') % (shop['tshopId'])
    html = getHtml(shop_url)
    if html == 'req_error':
        return
    detail_json = json.loads(html.strip())
    print(shop_url)
    detail = detail_json[0]['ShopDetailList'][0]
    # The image name lives in a sibling list, not in ShopDetailList.
    img_name = detail_json[0]['ShopImgList'][0]['timgName']
    print(city['tvalue'], area['businessAreaName'], shop['tcity'], shop['tlongitude'],
          shop['tstatus'], shop['tshopId'], shop['timgType'], shop['tlatitude'],
          shop['tmerchName'], shop['taddress'], img_name, shop['tdiscount'], img_name)
    content = ('名称:' + shop['tmerchName'] + '\n'
               + '地址:' + shop['taddress'] + '\n'
               + '优惠信息:' + shop['tdiscount'] + '\n')
    # 'ditail' (sic — the API's own key name) is a list of key/value rows.
    for item in detail['ditail']:
        content = content + item['key'] + ':' + item['value'] + '\n'
    print('--------------------------------------')
    content = (content + '图片地址:' + 'https://creditcard.cmbc.com.cn/' + img_name
               + '\n' + '--------------------------------------' + '\n')
    writeTxt('cmbc', city['tvalue'], area['businessAreaName'], content)


def writeTxt(bank_name, menu_name, short_name, content):
    """Append *content* to <script dir>/<bank_name>/<menu_name>/<short_name>.txt.

    Creates the directory tree on first use.  Opens in append mode so
    repeated crawls of the same business area accumulate in one file.
    """
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    menu_dir = os.path.join(parent_dir, bank_name, menu_name)
    # exist_ok replaces the original isdir-check-then-makedirs race, and the
    # original os.chdir(menu_dir) is dropped: it mutated the process-wide
    # working directory while the open() below already uses an absolute path.
    os.makedirs(menu_dir, exist_ok=True)
    file_name = os.path.join(menu_dir, short_name + '.txt')
    with open(file_name, 'a', encoding='utf-8') as file:
        file.write(content)


if __name__ == '__main__':
    # Script entry point: kick off the crawl from the city-list endpoint.
    entry_url = 'https://creditcard.cmbc.com.cn/fe/getCityList.gsp'
    get_fenye(entry_url)

运行结果:

后续可以加上线程池的方式来并发请求,加速爬取优惠信息。

1,增加状态返回码判断:单进程爬取时偶尔发现报错,但重复爬取时又没有错误信息,所以对响应码做了显式检查。

2,将爬取信息写到文件中,按城市划分目录,地区来命名。

3,

上一篇下一篇

猜你喜欢

热点阅读