2_2抓取手机号_笔记
2016-12-03 本文已影响0人
蜂DAO
最终效果
Paste_Image.png
我的代码
from bs4 import BeautifulSoup
import requests
import time
import pymongo
# MongoDB connection: database "homework" with two collections —
# work2_2 stores the (link, number) pairs scraped from list pages,
# work2_2con stores the detail-page records for each number.
client = pymongo.MongoClient('localhost',27017)
homework = client['homework']
work2_2 = homework['work2_2']
work2_2con = homework['work2_2con']
# First list page of the 58.com Beijing phone-number section.
url = 'http://bj.58.com/shoujihao/pn1/'
#infolist > div > ul > div > ul > li:nth-child(2) > a.t
# Scrape the phone-number links from a list page.
# Scrape the numbers shown on that page as well.
def get_links(page):
    """Fetch one 58.com list page and store every (link, number) pair.

    For each listing anchor found on *page*, saves a document with the
    detail-page href and the displayed phone number into the work2_2
    collection, printing each document as it is inserted.
    """
    response = requests.get(page)
    soup = BeautifulSoup(response.text, 'lxml')
    anchors = soup.select('.boxlist > ul > li > a.t ')
    strongs = soup.select('.boxlist > ul > li > a.t > strong ')
    for anchor, strong in zip(anchors, strongs):
        record = {
            "link": anchor.get('href'),
            "number": strong.get_text(),
        }
        print(record, '\n---------------------\n')
        work2_2.insert_one(record)
# Crawl the number links across a range of list pages.
def get_pageUrl(num):
    """Crawl list pages 1 through *num* (inclusive), scraping each one.

    Bug fix: the original used ``range(1, num)``, which stops at page
    ``num - 1`` — so ``get_pageUrl(50)`` (documented as "crawl 50 pages")
    actually crawled only 49. ``range(1, num + 1)`` covers all num pages.
    """
    urls = ['http://bj.58.com/shoujihao/pn{}/'.format(i) for i in range(1, num + 1)]
    for url in urls:
        print(url)
        get_links(url)
        time.sleep(0.5)  # throttle requests to be polite to the server
# Crawl the numbers and links on 50 list pages.
#get_pageUrl(50)
# Visit each stored link whose URL contains 'bj.58.com' (this filters
# out promoted/redirect entries) and scrape its detail page.
for item in work2_2.find({'link':{'$regex':'bj.58.com'}}):
    print(item['link'], item['number'])
    wb_data = requests.get(item['link'])
    Soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = Soup.select('div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1')
    prices = Soup.select('div.col.detailPrimary.mb15 > div.col_sub.sumary > ul > li > div.su_con > span')
    # Robustness fix: the original indexed [0] unconditionally, which
    # raises IndexError on any page whose layout differs (expired or
    # redirected ads). Skip such pages instead of crashing mid-crawl.
    if not titles or not prices:
        continue
    data = {
        "title": titles[0].get_text().strip(),
        "price": prices[0].get_text().strip(),
        "number": item['number'],
        "link": item['link'],
    }
    work2_2con.insert_one(data)
    print(data)
    time.sleep(0.5)  # throttle between detail-page requests
学到的知识
- PyMongo 模糊查找（正则匹配）：{'xxx':{'$regex':'xxx'}}
例:
for item in work2_2.find({'link':{'$regex':'bj.58.com'}}):
print(item['link'])
- 利用函数def来封装代码