A simple Python crawler for scraping 福利 ("goodies") gifs from the Hupu community
2018-02-22
盗花
The following program has been tested and works. The key lines are all commented, so I won't walk through them one by one; judge the results for yourself. PS: the gifs are saved into a hupu_gif folder under the current directory (the script creates it automatically if it does not exist).
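
A small aside before the code: the q= value in the search URL is nothing mysterious, just the UTF-8 percent-encoding of the two characters 福利. You can check this with the standard library:

from urllib.parse import quote, unquote

print(quote('福利'))                   # -> %E7%A6%8F%E5%88%A9
print(unquote('%E7%A6%8F%E5%88%A9'))  # -> 福利
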
# coding: utf-8
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import re
import os

url = 'https://my.hupu.com/search?q=%E7%A6%8F%E5%88%A9'  # Hupu search results for "福利"; the q= value is the percent-encoding of those two characters
htmls = set()   # gif URLs already seen, used to skip duplicates
startPage = 11  # first page to crawl, adjust as needed
endPage = 20    # last page to crawl, adjust as needed; the search results run to 1000 pages, so do not go beyond 1000
pages = range(startPage, endPage + 1)
count = 0

os.makedirs('hupu_gif', exist_ok=True)  # create the output folder if it does not exist yet

for page in pages:
    url_page = url + '&page=' + str(page)  # one concrete search-results page
    html = urlopen(url_page)
    bsObj = BeautifulSoup(html, 'lxml')
    tds = bsObj.find_all('td', {'class': 'p_title'})  # the td cells that hold the thread links
    for td in tds:
        if td.a and td.a.get('href'):
            td_href = td.a.get('href')
            print('td_href=>', td_href)
            try:
                html_each = urlopen(td_href)
            except Exception as e:
                print(e)
                print('Error, moving on to the next link')
                continue
            bsObj_each = BeautifulSoup(html_each, 'lxml')
            gifs = bsObj_each.find_all('img', src=re.compile(r'.*gif'))  # images whose src points at a gif
            for gif in gifs:
                gif_href = gif.attrs['src']
                gif_href = re.match(r'.*gif', gif_href).group()  # drop anything after ".gif" (query strings etc.)
                if gif_href not in htmls:
                    htmls.add(gif_href)
                    print(gif_href)
                    try:
                        local_filename, headers = urlretrieve(gif_href, filename='./hupu_gif/{}.gif'.format(count))
                    except Exception as e:
                        print('Error=>', e)
                        continue
                    if os.path.getsize(local_filename) >= 10000:  # keep gifs of at least 10,000 bytes (roughly 10 KB)
                        count += 1
                    else:  # otherwise delete the file
                        os.remove(local_filename)
print('Done!')
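
If Hupu ever starts rejecting requests sent with the default Python-urllib User-Agent (a common anti-crawler measure), one workaround is to attach a browser-like header yourself. Below is a minimal sketch, assuming the rest of the script stays the same; the UA string is just a placeholder, and because urlretrieve does not accept custom headers, the download is done by hand:

from urllib.request import Request, urlopen

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder browser-like UA string

def fetch(url):
    # Attach the header by wrapping the URL in a Request object.
    return urlopen(Request(url, headers=HEADERS))

def download_gif(url, filename):
    # urlretrieve() has no headers parameter, so read the bytes and write them manually.
    with fetch(url) as resp, open(filename, 'wb') as f:
        f.write(resp.read())

In the main loop you would then call fetch(url_page) instead of urlopen(url_page), and download_gif(gif_href, ...) instead of urlretrieve(...).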