程序员

宅男福利 用Python爬取美女图片

2021-01-04  本文已影响0人  Miku丨无形

嘿嘿 召唤老色批
今天带大家爬取一下美女的图片

用的是requests和xpath去解析

获取网页和解析网页的函数

def get_tag(response, tag):
    """Parse an HTML string and return the list of nodes matching *tag* (an XPath expression)."""
    tree = etree.HTML(response)
    return tree.xpath(tag)

def parse_url(url):
    """Fetch *url* with the shared request headers and return the body as text."""
    resp = requests.get(url, headers=headers)
    return resp.text

获取网页url

def url_find(url):
    """Scan one listing page: extract every album link and its title,
    then download each album in turn."""
    r = parse_url(url)
    url_list = get_tag(r, '//*[@id="pins"]/li/span[1]/a/@href')
    title = get_tag(r, '//*[@id="pins"]/li/span[1]/a/text()')
    # Walk links and titles in lockstep instead of indexing by position.
    for album_url, album_title in zip(url_list, title):
        url_jpg_find(album_url, album_title)
        # Bug fix: the original printed the WHOLE title list on every
        # iteration; report only the album that just finished.
        print(album_title, '保存完毕')

获取图片的url

def url_jpg_find(url, title):
    """Walk every page of one album and download each image.

    *url* is the album's first page; *title* is used as the output
    directory name. Resets the module-level ``page`` counter to 0.
    """
    global page
    page = 0
    r = parse_url(url)
    # The last pagination link's text holds the album's page count.
    url_last = int(get_tag(r, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    # Page 1 is the bare album URL; pages 2..N append "/<n>".
    url_list = [url] + [url + '/' + str(i) for i in range(2, url_last + 1)]
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(title, exist_ok=True)
    for page_url in url_list:
        content_find(page_url, title)

获取图片的信息

def content_find(url, title):
    """Extract the image name and source URL from one album page, then save the image."""
    html = parse_url(url)
    name = get_tag(html, '/html/body/div[2]/div[1]/h2/text()')[0]
    url_jpg = get_tag(html, '//div[@class="main-image"]//a/img/@src')[0]
    # Throttle a little between requests to be gentle on the server.
    time.sleep(0.2)
    save(name, url_jpg, title)

保存图片

def save(name, url_jpg, title):
    """Download one image and write it to <cwd>/<title>/<name>.jpg.

    Increments and prints the module-level ``page`` counter.
    """
    global page
    r = requests.get(url_jpg, headers=headers)
    # os.path.join is portable; the original concatenated with '/'.
    path = os.path.join(os.getcwd(), title, name + '.jpg')
    with open(path, 'wb') as f:
        f.write(r.content)
    # Removed the redundant close(): the with-statement already closed the file.
    page += 1
    print(page)

import requests,os,time
from lxml import etree

# Shared request headers: a browser User-Agent plus the site's own Referer
# (presumably needed to get past hotlink protection — confirm against the site).
headers={
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Referer" : "https://www.mzitu.com",
}

# Running count of images saved for the current album (reset in url_jpg_find,
# incremented in save).
page=0

def get_tag(response, tag):
    """Parse an HTML string and return the list of nodes matching *tag* (an XPath expression)."""
    tree = etree.HTML(response)
    return tree.xpath(tag)

def parse_url(url):
    """Fetch *url* with the shared request headers and return the body as text."""
    resp = requests.get(url, headers=headers)
    return resp.text

def url_find(url):
    """Scan one listing page: extract every album link and its title,
    then download each album in turn."""
    r = parse_url(url)
    url_list = get_tag(r, '//*[@id="pins"]/li/span[1]/a/@href')
    title = get_tag(r, '//*[@id="pins"]/li/span[1]/a/text()')
    # Walk links and titles in lockstep instead of indexing by position.
    for album_url, album_title in zip(url_list, title):
        url_jpg_find(album_url, album_title)
        # Bug fix: the original printed the WHOLE title list on every
        # iteration; report only the album that just finished.
        print(album_title, '保存完毕')

def url_jpg_find(url, title):
    """Walk every page of one album and download each image.

    *url* is the album's first page; *title* is used as the output
    directory name. Resets the module-level ``page`` counter to 0.
    """
    global page
    page = 0
    r = parse_url(url)
    # The last pagination link's text holds the album's page count.
    url_last = int(get_tag(r, '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()')[0])
    # Page 1 is the bare album URL; pages 2..N append "/<n>".
    url_list = [url] + [url + '/' + str(i) for i in range(2, url_last + 1)]
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(title, exist_ok=True)
    for page_url in url_list:
        content_find(page_url, title)

def content_find(url, title):
    """Extract the image name and source URL from one album page, then save the image."""
    html = parse_url(url)
    name = get_tag(html, '/html/body/div[2]/div[1]/h2/text()')[0]
    url_jpg = get_tag(html, '//div[@class="main-image"]//a/img/@src')[0]
    # Throttle a little between requests to be gentle on the server.
    time.sleep(0.2)
    save(name, url_jpg, title)

def save(name, url_jpg, title):
    """Download one image and write it to <cwd>/<title>/<name>.jpg.

    Increments and prints the module-level ``page`` counter.
    """
    global page
    r = requests.get(url_jpg, headers=headers)
    # os.path.join is portable; the original concatenated with '/'.
    path = os.path.join(os.getcwd(), title, name + '.jpg')
    with open(path, 'wb') as f:
        f.write(r.content)
    # Removed the redundant close(): the with-statement already closed the file.
    page += 1
    print(page)

def main():
    """Entry point: enumerate every listing page of the site and crawl each one."""
    start_url = 'https://www.mzitu.com'
    html = parse_url(start_url)
    # The fourth pagination anchor's text is the total number of listing pages.
    url_last = int(get_tag(html, '/html/body/div[2]/div[1]/div[3]/div/a[4]/text()')[0])
    base = 'https://www.mzitu.com/page/'
    url_list = ['https://www.mzitu.com'] + [base + str(i) for i in range(2, url_last + 1)]
    for url in url_list:
        url_find(url)


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

效果图就不放了
咳咳 太诱人 会被封掉
请大家自行脑补一下

一起学习python,小白指导,教学分享记得私信我

上一篇下一篇

猜你喜欢

热点阅读