大众点评页面抓取实例

2017-01-01  本文已影响0人  up_shang

#coding:utf-8

import re

from bs4 import BeautifulSoup as bs

# Parse a locally saved Dianping shop-list page ('dianping.html') and print
# one line per shop: name, URL, star rating, category, area, address, average
# price, taste / environment / service scores and the review count.
#
# NOTE(review): the published listing had lost its indentation, and every
# regex except the star one had been reduced to '(.*?)', which can only ever
# match the empty string (so all those fields came out blank). The <b>
# pattern below follows directly from how a <b> tag serialises. The two
# <span> class names are an ASSUMPTION about the page markup -- confirm them
# against the saved HTML before relying on cls / area / addr.
TAG_SPAN_PATTERN = r'<span class="tag">(.*?)</span>'
ADDR_SPAN_PATTERN = r'<span class="addr">(.*?)</span>'
BOLD_PATTERN = r'<b>(.*?)</b>'


def _capture(pattern, text, index=0):
    """Return the index-th capture of pattern in text, or '' if there is none."""
    found = re.findall(pattern, text)
    return found[index] if index < len(found) else ''


def _bold_text(bold_tags, index):
    """Return the text inside the index-th <b> tag, or '' if it is missing."""
    if index >= len(bold_tags):
        return ''
    return _capture(BOLD_PATTERN, str(bold_tags[index]))


with open('dianping.html', 'rb') as f:
    # Decode explicitly instead of depending on the interpreter's default codec.
    html = f.read().decode('utf-8')

dianping = bs(html, 'lxml')

allshops = dianping.find('div', attrs={'class': 'shop-list J_shop-list shop-all-list'})
if allshops is None:
    # Fail with a readable message rather than a bare IndexError.
    raise SystemExit('dianping.html: shop list container not found')

shops = allshops.find_all('li')

for eachshop in shops:
    # Missing tags yield '' (same best-effort policy as the other fields)
    # instead of aborting the whole run on one odd <li>.
    name = eachshop.h4.string if eachshop.h4 is not None else ''
    shopurl = eachshop.a.get('href', '') if eachshop.a is not None else ''

    # Collect the tag lists once per shop; every field below reads from them.
    spans = eachshop.find_all('span')
    spans_html = str(spans)
    bolds = eachshop.find_all('b')

    # Star rating is the title attribute of the first <span>.
    star = spans[0].get('title', '') if spans else ''

    # Category and area are the first and second "tag" spans respectively.
    cls = _capture(TAG_SPAN_PATTERN, spans_html, 0)
    area = _capture(TAG_SPAN_PATTERN, spans_html, 1)
    addr = _capture(ADDR_SPAN_PATTERN, spans_html, 0)

    # The <b> tags are read positionally, in the order the script has always
    # assumed: review count, mean price, taste, environment, service.
    comments = _bold_text(bolds, 0)
    mean = _bold_text(bolds, 1)
    taste = _bold_text(bolds, 2)
    envior = _bold_text(bolds, 3)
    service = _bold_text(bolds, 4)

    print (name,shopurl,star,cls,area,addr,mean,taste,envior,service,comments)

上一篇 下一篇

猜你喜欢

热点阅读