第一周大作业-爬取58同城商品信息
2016-07-01 本文已影响0人
碾香年年念
运行结果
![](https://img.haomeiwen.com/i2367408/544825ef5cc961ef.png)
代码部分
from bs4 import BeautifulSoup
import requests,urllib.request
import time
# Request headers for all page fetches.
# BUG FIX: the original User-Agent contained stray spaces ("Mozilla / 5.0(Windows NT ...")
# which is not a well-formed UA token list and can get requests rejected or served
# a degraded page. Normalized to the standard Chrome 51 UA string.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/51.0.2704.103 Safari/537.36'),
}

# Listing pages to crawl. range(1, 2) yields only page 1; widen the range to
# crawl more pages of the second-hand laptop ("pbdn") category on bj.58.com.
urls = ['http://bj.58.com/pbdn/1/pn{}/?PGTID=0d305a36-0000-1b3b-1598-57f0dc305892&ClickID=1'.format(i) for i in range(1, 2)]
def get_link(url):
    """Fetch one listing page and run analy() on every item detail link found.

    url: a bj.58.com category listing page URL.
    Side effects: issues HTTP requests; analy() prints one dict per item.
    """
    wb_page = requests.get(url, headers=headers)
    sou = BeautifulSoup(wb_page.text, 'lxml')
    # Each listing row's title cell contains the anchor to the item detail page.
    links = sou.select('tr > td.t > a.t')
    # Original code collected hrefs into a temporary list and then looped over
    # it again; a single loop does the same work without the intermediate list.
    for link in links:
        href = link.get('href')
        if href:  # skip anchors without an href rather than passing None on
            analy(href)
def get_clicks(url):
    """Return the view-count string for a 58.com item detail page.

    url: the item detail URL; its last path segment looks like
    '<infoid>x.shtml', so the text before the first 'x' is the numeric info id.
    Returns the counter value as a string (whatever follows the last '=' in
    the JS counter response).

    NOTE(review): the author reports this counter endpoint did not work for
    them; it likely requires extra headers (e.g. Referer) — TODO confirm.
    """
    # Renamed from `id` to avoid shadowing the builtin.
    info_id = url.split('/')[-1].split('x')[0]
    address = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    js = requests.get(address)
    # Response is a JS snippet like "Counter58.total=1234"; take the value.
    clicks = js.text.split('=')[-1]
    return clicks
def analy(wblink):
    """Scrape one 58.com item detail page and print a dict of its fields.

    wblink: item detail page URL.
    Side effects: HTTP requests (page itself plus the click counter) and a
    print() of the scraped record.

    Robustness fixes over the original:
    - `cat[-1]`, `cond[1]` and `tet.split('\t')[1]` all raised IndexError on
      pages missing those elements; each access is now guarded.
    - `realcod` was referenced even when no condition element existed
      (NameError); it now defaults to ''.
    """
    wb_data = requests.get(wblink, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # Breadcrumb: last anchor is the most specific category.
    cat = soup.select('#header > div.breadCrumb.f12 > span > a')
    cates = [cat[-1]] if cat else []

    titles = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')
    times = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
    prices = soup.select(
        '#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span.price.c_f50')
    cond = soup.select(
        'div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span')
    # The second <span> in the summary list holds the item condition;
    # guard against pages that render fewer spans.
    conditions = [cond[1]] if len(cond) > 1 else []

    places = soup.select(
        'div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span > a')
    wholeplace = [place.get_text() for place in places]

    # Condition text is wrapped in layout whitespace like "...\t成色\r...";
    # extract the part between the first '\t' and the following '\r'.
    realcod = ''
    for condition in conditions:
        tet = condition.get_text()
        parts = tet.split('\t')
        if len(parts) > 1:
            realcod = parts[1].split('\r')[0]

    for cate, title, tim, price, condition in zip(cates, titles, times, prices, conditions):
        dat = {
            'cate': cate.get_text(),
            'title': title.get_text(),
            'times': tim.get_text(),
            'price': price.get_text(),
            'condition': realcod,
            'place': wholeplace,
            'click': get_clicks(wblink),
        }
        print(dat)
# Crawl every listing page collected in `urls`.
for page_url in urls:
    get_link(page_url)
总结
- 关于点击量的爬取还是不行，不清楚原因——即使按视频中的方法请求那段 JS 计数接口，也拿不到点击量。
- 有关爬取的路径还可以简化
- 有关地址,有些没有,有些有一个,有些有两个,似乎我的处理办法还可以变简单一些