2_2抓取手机号_笔记

2016-12-03  本文已影响0人  蜂DAO

最终效果

Paste_Image.png

我的代码

from bs4 import BeautifulSoup
import requests
import time
import pymongo

# MongoDB handles: one client, the `homework` database, and the two
# collections used below -- raw listing rows and enriched detail records.
client = pymongo.MongoClient('mongodb://localhost:27017/')
homework = client['homework']
work2_2 = homework['work2_2']
work2_2con = homework['work2_2con']

#生成列表页链接

# Sample listing-page URL (page 1); get_pageUrl() generates the full range.
url = "http://bj.58.com/shoujihao/pn1/"

#infolist > div > ul > div > ul > li:nth-child(2) > a.t
#爬取手机号链接


#爬取页面上的号码
# Scrape phone-number listings (link + number) from one result page.
def get_links(page, timeout=10):
    """Fetch one 58.com listing page and store each (link, number) pair.

    Args:
        page: URL of the listing page to scrape.
        timeout: seconds to wait for the HTTP response (added so a stalled
            request cannot hang the crawl forever; default keeps old callers
            working).

    Side effects: prints each record and inserts it into the ``work2_2``
    MongoDB collection.
    """
    wb_data = requests.get(page, timeout=timeout)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Anchor and its <strong> child line up one-to-one per listing row.
    links = soup.select('.boxlist > ul > li > a.t ')
    numbers = soup.select('.boxlist > ul > li > a.t > strong ')
    for link, number in zip(links, numbers):
        data = {
            "link": link.get('href'),
            "number": number.get_text(),
        }
        print(data,'\n---------------------\n')
        work2_2.insert_one(data)

# 爬取指数页面范围上的号码链接
# Crawl listing pages 1..num (inclusive), scraping each one.
def get_pageUrl(num):
    """Scrape phone-number listings from the first *num* pages.

    Bug fix: the original used ``range(1, num)``, which stopped at page
    ``num - 1`` -- ``get_pageUrl(50)`` only fetched 49 pages despite the
    caller comment saying 50. ``range(1, num + 1)`` covers all num pages.
    """
    urls = ['http://bj.58.com/shoujihao/pn{}/'.format(i) for i in range(1, num + 1)]
    for url in urls:
        print(url)
        get_links(url)
        time.sleep(0.5)  # be polite: throttle requests to the site

#抓取50个页面上的号码和链接
#get_pageUrl(50)

#挑选出链接中含有'bj.58.com'的正常内容链接

# Detail pass: for every stored listing whose link points at the normal
# bj.58.com host, fetch the detail page and store title/price with the number.
# Bug fix: the regex dots are now escaped -- the original 'bj.58.com' let '.'
# match ANY character (e.g. 'bjX58Ycom' would pass); r'bj\.58\.com' matches
# only the literal host.
for item in work2_2.find({'link': {'$regex': r'bj\.58\.com'}}):
    print(item['link'], item['number'])
    wb_data = requests.get(item['link'])
    Soup = BeautifulSoup(wb_data.text, 'lxml')
    # First match of each selector is the page title / asking price.
    title = Soup.select('div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1')[0].get_text().strip()
    price = Soup.select('div.col.detailPrimary.mb15 > div.col_sub.sumary > ul > li > div.su_con > span')[0].get_text().strip()
    data = {
        "title": title,
        "price": price,
        "number": item['number'],
        "link": item['link']
    }
    work2_2con.insert_one(data)
    print(data)
    time.sleep(0.5)  # throttle detail-page requests


学到的知识

例:
for item in work2_2.find({'link':{'$regex':'bj.58.com'}}):
    print(item['link'])
上一篇下一篇

猜你喜欢

热点阅读