爬虫大数据 爬虫Python AI SqlPython数据采集与爬虫

按关键词抓取mm29妹子图片(BeautifulSoup,url

2017-12-03  本文已影响21人  安和然

前一篇文章介绍了下载全站图片,有同学问,能不能只下载某一个关键词的。

这个要灵活一点,不用框架,直接用urllib。

#coding=utf-8
from bs4 import BeautifulSoup
import urllib
import requests
import os
from multiprocessing.pool import Pool
import multiprocessing

def getAllUrl(url,m,n): #m为首页码,初始为2,n为终页码,为227
    urllist = []
    for x in range(m,n + 1):
        print("get page" + str(x))
        html_doc = urllib.urlopen(url + str(x) ).read().decode('utf-8')
        soup = BeautifulSoup(html_doc, "lxml")
        mm = soup.find("div", id='maincontent')
        my = mm.find_all("a", target = "_blank")
        for u in my:
            if len(u.get("href")) < 40:
                urllist.append(u.get("href"))
    return list(set(urllist))

def downlaodimg(urls):
    girls = len(urls)
    for i in range(0,girls):
        html_doc = urllib.urlopen(urls[i]).read().decode('utf-8')
        soup = BeautifulSoup(html_doc, "lxml")
        for imgs in soup.find_all('p'):
            imglist = imgs.find_all('img')
            for u in imglist:
                try:
                    img_src = u.get("src")
                    pic_name = (u.get("alt"))+ "1" + '.jpg'
                    urllib.urlretrieve(img_src, pic_name)
                    print(u.get("alt"))
                except:
                    continue
    girls += 1

#这里可以自主指定关键词,需要的起始页和终止页,
allpage = getAllUrl("http://www.mm29.com/tag/甜美/",2,4)
print(allpage)
print("get page" +str(len(allpage)))
downlaodimg(allpage)
上一篇下一篇

猜你喜欢

热点阅读