Scraping every page of images from a Baidu Tieba thread with BeautifulSoup
2017-05-14
NoValue
Reference
See: http://www.jianshu.com/p/45e13334a71f
This is my BeautifulSoup take on it: it crawls all of the image links under a given Tieba thread.
Advantages
- No page number to type in: the script works out the total number of pages by itself.
- It automatically crawls the image resources on every page of the thread.
- Files are renamed automatically, and the file name tells you which page each image came from.
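The script only needs the third-party packages requests and beautifulsoup4 (everything else comes from the standard library). Assuming Python 3, they can be installed with:

pip install requests beautifulsoup4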
The result looks like this:

[screenshot: tiebapachong.png]

# -*- coding:utf-8 -*-
# **********************************
# ** http://weibo.com/lixiaodaoaaa #
# ****** by:lixiaodaoaaa ***********
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

def convertUrlToBeautifulSoup(url):
    # Fetch the page and parse the HTML into a BeautifulSoup object
    getStr = requests.get(url)
    getStr.encoding = "utf-8"
    return BeautifulSoup(getStr.text, "html.parser")

def download_img(beSoup, page):
    # Every inline post image in a Tieba thread carries the BDE_Image class
    for myImg in beSoup.select(".BDE_Image"):
        imgUrl = myImg.get("src")
        strPage = "page_%d_fileName=" % page
        strTime = "%s.jpg" % time.time()
        fileName = strPage + strTime  # e.g. page_1_fileName=<timestamp>.jpg
        print(fileName)
        urllib.request.urlretrieve(imgUrl, fileName)

def getTotalNumber(beSoup):
    # The total page count is the second .red element inside .l_reply_num
    page = beSoup.select(".l_reply_num")[0].select(".red")[1].text
    return int(page)

def getParaUrl(sourUrl, page):
    # Tieba paginates a thread with the ?pn= query parameter
    para = "?pn=%d" % page
    hasParaUrl = sourUrl + para
    return hasParaUrl

if __name__ == '__main__':
    startUrl = input('Enter the thread URL (drop the ?pn= parameter): ')
    mySoup = convertUrlToBeautifulSoup(startUrl)
    totalPage = getTotalNumber(mySoup)
    startPage = 2
    download_img(mySoup, 1)  # the bare URL is page 1
    while startPage <= totalPage:
        tempSoup = convertUrlToBeautifulSoup(getParaUrl(startUrl, startPage))
        download_img(tempSoup, startPage)
        startPage += 1
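A minimal run might look like the transcript below. The script name tieba_spider.py and the thread URL are only hypothetical examples; Tieba thread URLs follow the pattern https://tieba.baidu.com/p/<thread-id>, and the script appends ?pn=<page> itself, so paste the URL without that parameter.

$ python3 tieba_spider.py
Enter the thread URL (drop the ?pn= parameter): https://tieba.baidu.com/p/1234567890
page_1_fileName=1494700000.0.jpg
...

The downloaded .jpg files are saved in the current working directory; the page_<n>_ prefix in each name records which page the image came from.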