Python MongoDB 爬取58网站
2017-05-06 本文已影响10人
宁静消失何如
# Author attribution for this script.
__author__ = 'Lee'
from bs4 import BeautifulSoup
import requests
# Crawl the second-hand category channels of 58.com.
url_host = 'http://bj.58.com'  # prefix prepended to the hrefs found on the page
start_url = 'http://bj.58.com/sale.shtml'  # page the crawl starts from
def get_channel_urls(url):
    """Print the absolute URL of every channel link found on a page.

    Downloads ``url``, parses the response text with the ``lxml`` parser,
    selects the anchors matching ``ul.ym-submnu > li > b > a`` and prints
    each anchor's ``href`` prefixed with the module-level ``url_host``.

    Args:
        url: Address of the page to fetch.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        href = link.get('href')
        if not href:
            # An anchor without an href yields None, which cannot be
            # concatenated with url_host; skip it instead of crashing.
            continue
        page_url = url_host + href
        print(page_url)