Python实战计划学习笔记(7)爬取58同城信息
2016-08-27 本文已影响0人
如恒河沙
任务简述
- 爬取58同城某页面列表页中,除了转转、推广商品之外的正常商品
- 要求爬取的商品信息包括:
- 类目
- 标题
- 发帖时间
- 价格
- 成色
- 区域
任务分析
- 静态加载的网页
- 翻页靠 http://bj.58.com/pbdn/0/pn1/ 中pn后面的数字控制
- 推广商品的路径是:
#jingzhun > tbody > tr:nth-child(1) > td.t > a.t
#jingzhun > tbody > tr:nth-child(2) > td.t > a.t
#jingzhun > tbody > tr:nth-child(3) > td.t > a.t
- 转转商品的路径是:
#infolist > div.infocon > table > tbody > tr:nth-child(1) > td.t > a
#infolist > div.infocon > table > tbody > tr:nth-child(2) > td.t > a
#infolist > div.infocon > table > tbody > tr:nth-child(3) > td.t > a
- 正常商品的路径是:
#infolist > div.infocon > table > tbody > tr:nth-child(6) > td.t > a.t
我的代码
from bs4 import BeautifulSoup
import requests,urllib.request
import time
# Listing pages to crawl: the entry page first, then the numbered "pn" pages
# (range(2, 3) currently yields page 2 only).
url = 'http://bj.58.com/pbdn/0/'
urls = [
    url,
    *('http://bj.58.com/pbdn/0/pn{}/'.format(page_no) for page_no in range(2, 3)),
]
def get_page(url):
    """Download a listing page and return the link tags of its item rows.

    The page is fetched with ``requests.get`` and parsed with the ``lxml``
    parser; the result is whatever the CSS selector below matches.
    """
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, 'lxml')
    # Title links of every table row inside the listing container.
    return parsed.select('div.infocon > table > tbody > tr > td.t > a')
def find_normal_items(items):
    """Return the hrefs of listing links that are not Zhuanzhuan items.

    Args:
        items: iterable of link elements exposing ``.get('href')``
            (for example the tags returned by ``get_page``).

    Returns:
        A list of href strings, in input order, whose host does not start
        with ``zhuanzhuan``. Elements without an href are skipped.
    """
    # Function-scope import keeps this block independent of file-level imports.
    from urllib.parse import urlparse

    normal_item_links = []
    for item in items:
        href = item.get('href')
        if not href:
            # An anchor without a target cannot be crawled; slicing a missing
            # href used to raise TypeError here.
            continue
        # Compare the parsed host rather than slicing at a fixed offset: the
        # old ``href[7:17]`` check only worked for ``http://`` URLs and let
        # ``https://`` or protocol-relative Zhuanzhuan links through.
        host = urlparse(href).hostname or ''
        if host.startswith('zhuanzhuan'):
            continue
        normal_item_links.append(href)
    return normal_item_links
def get_details(url):
    """Fetch one item detail page and extract its fields.

    Args:
        url: address of the item detail page.

    Returns:
        A dict with the keys ``cate``, ``title``, ``date``, ``price``,
        ``chengse`` and ``area`` built from the first element matched by each
        selector, or ``None`` when any of the selectors matches nothing.
    """
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')

    selectors = (
        ('cate', 'div.breadCrumb.f12 > span:nth-of-type(3) > a'),
        ('title', 'div.col_sub.mainTitle > h1'),
        ('date', 'ul.mtit_con_left.fl > li.time'),
        # Matches the span of every summary row; only the first one is used.
        ('price', 'div.col_sub.sumary > ul > li > div.su_con > span'),
        ('chengse', 'div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span'),
        ('area', 'div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span'),
    )

    texts = {}
    for field, css in selectors:
        matches = soup.select(css)
        if not matches:
            # The previous zip()-based pairing produced nothing in this case
            # and left the result variable unassigned; report it explicitly.
            return None
        texts[field] = matches[0].get_text()

    # The condition text is expected to hold the value after the first tab,
    # followed by two trailing characters that are dropped. Fall back to the
    # stripped text instead of raising IndexError when there is no tab.
    chengse_parts = texts['chengse'].split('\t')
    if len(chengse_parts) > 1:
        chengse = chengse_parts[1][:-2]
    else:
        chengse = texts['chengse'].strip()

    return {
        'cate': texts['cate'],
        'title': texts['title'],
        'date': texts['date'],
        'price': texts['price'],
        'chengse': chengse,
        'area': texts['area'].strip('\t\n').replace('\t', '').replace('\r\n', ''),
    }
# Stage 1: collect the links of the normal items from every listing page.
all_normal_items_links = []
for n, single_url in enumerate(urls, start=1):
    data = find_normal_items(get_page(single_url))
    all_normal_items_links.extend(data)
    print('find',len(data),'normal items in page ',n)
    time.sleep(2)  # pause between listing-page requests

# Stage 2: fetch the details of every collected item.
result = []
for single_url in all_normal_items_links:
    # Fetch the link of the current iteration; the earlier version always
    # passed all_normal_items_links[0], so every entry repeated the first item.
    result.append(get_details(single_url))
    time.sleep(2)  # pause between detail-page requests
print(result)
运行结果
1.jpg
遗留问题
读取js加载的浏览量未成功,通过python读取js的api如下:
http://jst1.58.com/counter?infoid=21972416366734
返回结果为:
Counter58.userlist[0]={uid:'0',uname:'',face:'',vt:''};Counter58.total=0
但通过刷新网页返回结果为:
Counter58.userlist[0]={uid:'0',uname:'',face:'',vt:''};Counter58.total=3328
目前尚未找到两者返回值不同的原因
测试代码如下
test_url = 'http://bj.58.com/pingbandiannao/21972416366734x.shtml'


def get_view_count(url):
    """Ask the counter API for the view count of an item page.

    Args:
        url: item detail URL whose last path segment is ``<infoid>x.shtml``.

    Returns:
        The text after the last ``=`` in the API response, as a string.
    """
    page_name = url.split('/')[-1]
    suffix = 'x.shtml'
    # str.strip('x.shtml') removes any of those characters from both ends
    # rather than the literal suffix, so cut the suffix off explicitly.
    if page_name.endswith(suffix):
        info_id = page_name[:-len(suffix)]
    else:
        info_id = page_name
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    # NOTE(review): the notes around this snippet report that this request
    # returns total=0 while a browser refresh returns the real count.
    # Possibly the endpoint expects browser headers such as Referer -
    # unverified, confirm before relying on the value.
    js = requests.get(api)
    # The count is whatever follows the final '=' in the response text.
    views = js.text.split('=')[-1]
    return views


print(get_view_count(test_url))