爬取【京客隆超市】店铺信息

2020-08-24  本文已影响0人  JWLee

1.导入所需库

import requests

import pandas as pd

from lxml import etree

2.爬取各区链接

# Landing page that lists every district together with the link to its shops.
url = 'http://www.jkl.com.cn/cn/shop.aspx'
# Send a desktop-browser User-Agent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}

response = requests.get(url, headers=headers).text
html = etree.HTML(response)

# Link texts inside the "infoLis" block, with surrounding whitespace removed.
city_name = html.xpath('//div[@class="infoLis"]//a/text()')
city_name = [i.strip() for i in city_name]

# The matching hrefs; the site prefix is prepended to build full URLs.
city_url = html.xpath('//div[@class="infoLis"]//a/@href')
city_url = ['http://www.jkl.com.cn/cn/' + i for i in city_url]

3.当只存在一个大区需要翻页时,数据存储

# Visit every district page and append the shops found there to one CSV file.
for i in city_url:
    if i == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        # This district is the one handled with paging: pages 1-3 are requested
        # by posting the pager fields (presumably the site's AspNetPager
        # control -- confirm against the page's form).
        for a in range(1, 4):
            date = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': a,
            }
            response3 = requests.post(url=i, data=date, headers=headers).text
            html2 = etree.HTML(response3)
            city_shop_name = html2.xpath('//span[@class="con01"]/text()')
            city_shop_dis = html2.xpath('//span[@class="con02"]/text()')
            city_shop_phone = html2.xpath('//span[@class="con03"]/text()')
            city_shop_time = html2.xpath('//span[@class="con04"]/text()')
            shop_name = [d.strip() for d in city_shop_name]
            print(shop_name)
            print('*' * 30)
            # Store this page right away; mode="a" appends to the shared file.
            # NOTE: "ANSI" is an alias of the Windows-only mbcs codec.
            date = pd.DataFrame({"店铺名称":shop_name,"店铺地址":city_shop_dis,"联系方式":city_shop_phone,"营业时间":city_shop_time})
            date.to_csv("e:/爬取爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")
    else:
        # Every other district is fetched with a single request.
        response1 = requests.post(url=i, headers=headers).text
        html1 = etree.HTML(response1)
        city_shop_name1 = html1.xpath('//span[@class="con01"]/text()')
        city_shop_dis1 = html1.xpath('//span[@class="con02"]/text()')
        city_shop_phone1 = html1.xpath('//span[@class="con03"]/text()')
        city_shop_time1 = html1.xpath('//span[@class="con04"]/text()')
        shop_name1 = [c.strip() for c in city_shop_name1]
        print(shop_name1)

        # Data storage. This must stay inside the else branch: at loop level it
        # would also run after the paginated district and re-append the
        # previous district's rows (or fail with NameError if none exist yet).
        date = pd.DataFrame({"店铺名称":shop_name1,"店铺地址":city_shop_dis1,"联系方式":city_shop_phone1,"营业时间":city_shop_time1})
        date.to_csv("e:/爬取爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")

#完整代码

#爬取【京客隆超市】店铺信息

import requests

import pandas as pd

from lxml import etree

# Entry page listing all districts and the links to their shop lists.
url = 'http://www.jkl.com.cn/cn/shop.aspx'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}

# Download the entry page and parse it into an element tree.
response = requests.get(url, headers=headers).text
html = etree.HTML(response)

# District names: the link texts under the "infoLis" block, trimmed.
city_name = [
    name.strip()
    for name in html.xpath('//div[@class="infoLis"]//a/text()')
]

# District URLs: each href from the same links, with the site prefix prepended.
city_url = [
    'http://www.jkl.com.cn/cn/' + href
    for href in html.xpath('//div[@class="infoLis"]//a/@href')
]

# Visit every district page and append the shops found there to one CSV file.
for i in city_url:
    if i == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        # This district is the one handled with paging: pages 1-3 are requested
        # by posting the pager fields (presumably the site's AspNetPager
        # control -- confirm against the page's form).
        for a in range(1, 4):
            date = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': a,
            }
            response3 = requests.post(url=i, data=date, headers=headers).text
            html2 = etree.HTML(response3)
            city_shop_name = html2.xpath('//span[@class="con01"]/text()')
            city_shop_dis = html2.xpath('//span[@class="con02"]/text()')
            city_shop_phone = html2.xpath('//span[@class="con03"]/text()')
            city_shop_time = html2.xpath('//span[@class="con04"]/text()')
            shop_name = [d.strip() for d in city_shop_name]
            print(shop_name)
            print('*' * 30)
            # Store this page right away; mode="a" appends to the shared file.
            # NOTE: "ANSI" is an alias of the Windows-only mbcs codec.
            date = pd.DataFrame({"店铺名称":shop_name,"店铺地址":city_shop_dis,"联系方式":city_shop_phone,"营业时间":city_shop_time})
            date.to_csv("e:/爬取爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")
    else:
        # Every other district is fetched with a single request.
        response1 = requests.post(url=i, headers=headers).text
        html1 = etree.HTML(response1)
        city_shop_name1 = html1.xpath('//span[@class="con01"]/text()')
        city_shop_dis1 = html1.xpath('//span[@class="con02"]/text()')
        city_shop_phone1 = html1.xpath('//span[@class="con03"]/text()')
        city_shop_time1 = html1.xpath('//span[@class="con04"]/text()')
        shop_name1 = [c.strip() for c in city_shop_name1]
        print(shop_name1)

        # Data storage. This must stay inside the else branch: at loop level it
        # would also run after the paginated district and re-append the
        # previous district's rows (or fail with NameError if none exist yet).
        date = pd.DataFrame({"店铺名称":shop_name1,"店铺地址":city_shop_dis1,"联系方式":city_shop_phone1,"营业时间":city_shop_time1})
        date.to_csv("e:/爬取爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")

#以上代码同时处理了两种情况:区域内店铺不止一页(需要翻页),以及只有一页

作者:Jw__L

链接:https://juejin.im/post/6864348048642801672

来源:掘金

著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。

上一篇下一篇

猜你喜欢

热点阅读