Python四期爬虫作业

python爬取优惠信息

2017-09-05  本文已影响114人  困困harper

大部分银行的官网上都会公布促销信息,有些银行做的很规范,有些做的非常混乱,今天通过python爬取一家相对比较规范的银行促销信息,如下图:

当我看到这个促销信息,我内心是高兴的,特别的规整。

我通过chrome浏览器可以查看此页面源码,通过检测发现所有数据都是由ajax获取json来动态加载,这样就省去了遍历页面的麻烦,可以直接解析json数据。

首先我们通过chrome检测功能中的network来拦截消息,如下图

我发现(https://creditcard.cmbc.com.cn/fe/getType.gsp?city=%E5%8C%97%E4%BA%AC%E5%B8%82),这个请求的response会返回json,我通过这个请求获取商区信息。然后通过请求(https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp?city=1&dictrict=&latitude=&longitude=&mark=pc&page=1&resolving=&rows=16&typeName=)其中包含了城市ID、商区ID、页码、行数,这里我指定每页行数为10,通过这个请求会拿到当前页的所有促销信息和总行数。我根据总行数和每页10行,计算出需要翻页次数。最后进入促销信息介绍页面,如下图:

同样json格式获取信息:

整体代码如下:

# 导入包
import os
import requests
import json
from lxml import etree
from multiprocessing import Pool

def getHtml(url):
    """Fetch *url* and return the decoded response body as text.

    Returns the sentinel string 'req_error' when the request fails or the
    HTTP status is not 200 — every caller in this script compares against
    that sentinel before parsing the body as JSON.
    """
    try:
        # A timeout keeps the crawler from hanging forever on a stalled
        # connection; the original call had none.
        req = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network-level failures (DNS, refused connection, timeout) are
        # reported the same way as bad status codes.
        return 'req_error'
    if req.status_code != 200:
        return 'req_error'
    # The endpoint serves UTF-8 JSON; set the encoding explicitly so that
    # req.text decodes correctly regardless of the response headers.
    req.encoding = 'utf-8'
    return req.text


def get_fenye(url):
    """Crawl the bank's promotion listings starting from the city-list URL.

    Flow: city list -> business areas per city -> paged shop listings per
    area (10 rows per page) -> per-shop detail, which is written to disk
    via writeTxt().  Any request that returns the 'req_error' sentinel is
    skipped so one bad endpoint does not abort the whole crawl.
    """
    html = getHtml(url)
    if html == 'req_error':
        return
    for city in json.loads(html.strip()):
        # city looks like {'tkey': <city id>, 'tvalue': <city name>, 'py': <initial>}
        _crawl_city(city)


def _crawl_city(city):
    """Fetch one city's business-area list and crawl each area."""
    area_url = 'https://creditcard.cmbc.com.cn/fe/getType.gsp?city=%s' % (city['tvalue'])
    html = getHtml(area_url)
    if html == 'req_error':
        return
    areas = json.loads(html.strip())
    # The endpoint sometimes answers with a non-list payload; ignore those.
    if not isinstance(areas, list):
        return
    for area in areas[0]['cityList'][0]['businessAreaList']:
        _crawl_area(city, area)


# Shared listing-URL template: city id, business-area id, page number.
_LIST_URL = ('https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp'
             '?city=%s&dictrict=%s&kindName=&latitude=&longitude=&mark=pc'
             '&page=%d&resolving=&rows=10&typeName=')


def _crawl_area(city, area):
    """Page through one business area's shop listing and crawl every shop."""
    html = getHtml(_LIST_URL % (city['tkey'], area['businessAreaId'], 1))
    if html == 'req_error':
        return
    # rowCount is the total number of promotions in this area.
    row_cnt = json.loads(html.strip())[0]['rowCount']
    if row_cnt <= 0:
        return
    # Ceil-divide by the page size. The original bound, int(row_cnt/10)+2,
    # requested one extra (empty) page whenever row_cnt % 10 == 0.
    pages = (row_cnt + 9) // 10
    for page in range(1, pages + 1):
        page_url = _LIST_URL % (city['tkey'], area['businessAreaId'], page)
        print(page_url)
        page_html = getHtml(page_url)
        if page_html == 'req_error':
            continue
        for shop in json.loads(page_html.strip())[0]['ShopList']:
            _crawl_shop(city, area, shop)


def _crawl_shop(city, area, shop):
    """Fetch one shop's detail record and persist its promotion info."""
    shop_url = ('https://creditcard.cmbc.com.cn/fe/find/fingShopByShopName.gsp'
                '?mark =pc&resolving=&rowser=&rowserEdition=&shopId=%s') % (shop['tshopId'])
    html = getHtml(shop_url)
    if html == 'req_error':
        return
    detail_json = json.loads(html.strip())
    print(shop_url)
    detail = detail_json[0]['ShopDetailList'][0]
    # The image name lives in a sibling list, not in ShopDetailList.
    img_name = detail_json[0]['ShopImgList'][0]['timgName']
    print(city['tvalue'], area['businessAreaName'], shop['tcity'], shop['tlongitude'],
          shop['tstatus'], shop['tshopId'], shop['timgType'], shop['tlatitude'],
          shop['tmerchName'], shop['taddress'], img_name, shop['tdiscount'], img_name)
    content = ('名称:' + shop['tmerchName'] + '\n'
               + '地址:' + shop['taddress'] + '\n'
               + '优惠信息:' + shop['tdiscount'] + '\n')
    # 'ditail' (sic — the API's own key name) is a list of key/value rows.
    for item in detail['ditail']:
        content = content + item['key'] + ':' + item['value'] + '\n'
    print('--------------------------------------')
    content = (content + '图片地址:' + 'https://creditcard.cmbc.com.cn/' + img_name
               + '\n' + '--------------------------------------' + '\n')
    writeTxt('cmbc', city['tvalue'], area['businessAreaName'], content)


def writeTxt(bank_name, menu_name, short_name, content):
    """Append *content* to <script dir>/<bank_name>/<menu_name>/<short_name>.txt.

    Creates the directory tree on first use.  Opens in append mode so
    repeated crawls of the same business area accumulate in one file.
    """
    parent_dir = os.path.dirname(os.path.abspath(__file__))
    menu_dir = os.path.join(parent_dir, bank_name, menu_name)
    # exist_ok replaces the original isdir-check-then-makedirs race, and the
    # original os.chdir(menu_dir) is dropped: it mutated the process-wide
    # working directory while the open() below already uses an absolute path.
    os.makedirs(menu_dir, exist_ok=True)
    file_name = os.path.join(menu_dir, short_name + '.txt')
    with open(file_name, 'a', encoding='utf-8') as file:
        file.write(content)


if __name__ == '__main__':
    # Script entry point: kick off the crawl from the city-list endpoint.
    entry_url = 'https://creditcard.cmbc.com.cn/fe/getCityList.gsp'
    get_fenye(entry_url)

运行结果:

后续可以加上线程池的方式来并发请求,加速爬取优惠信息。

1,增加状态返回码判断:单进程爬取时偶尔发现报错,但重复爬取时又没有错误信息,所以对响应码做了显式检查。

2,将爬取信息写到文件中,按城市划分目录,地区来命名。

3,

上一篇下一篇

猜你喜欢

热点阅读