Scraping Sogou WeChat articles with Python and saving the images to Qiniu Cloud

2018-11-23  app_developer

The approach for saving images to Qiniu Cloud follows http://lastidea.net/?p=7; the full script is below.
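The key piece is Qiniu's fetch API: rather than downloading each picture and re-uploading it, you ask Qiniu's servers to pull the remote URL straight into your bucket. Here is a minimal sketch using the qiniu Python SDK (the keys, bucket name, and image URL are placeholders):

from qiniu import Auth, BucketManager

q = Auth("your-access-key", "your-secret-key")
bucket = BucketManager(q)

# Ask Qiniu to pull the remote image into the bucket under the given key
ret, info = bucket.fetch("http://example.com/pic.jpg", "your-bucket", "pic.jpg")
print(info.status_code)  # 200 on success

The full script: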

# -*- coding: UTF-8 -*-
# Python 2 script: scrape featured articles from Sogou's WeChat index
# and mirror their images to Qiniu Cloud.
import urllib2
from bs4 import BeautifulSoup
import socket
import requests
import time
import random
from qiniu import Auth
from qiniu import BucketManager


# Save a remote image into Qiniu Cloud storage via the fetch API
def qiniu(url, key):
    access_key = ""   # your access_key
    secret_key = ""   # your secret_key
    bucket_name = ""  # your bucket_name

    q = Auth(access_key, secret_key)
    bucket = BucketManager(q)

    # Qiniu pulls the remote URL server-side and stores it under `key`
    ret, info = bucket.fetch(url, bucket_name, key)
    #assert ret['key'] == key

def go():
    # Set up proxy access
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)

    enable_proxy = True
    proxy_handler = urllib2.ProxyHandler(proxies)
    null_proxy_handler = urllib2.ProxyHandler({})
    if enable_proxy:
        opener = urllib2.build_opener(proxy_handler)
    else:
        opener = urllib2.build_opener(null_proxy_handler)
    urllib2.install_opener(opener)  # install the opener in both cases

    for i in [0, 1, 2, 4, 5, 6, 7, 8]:  # Sogou category pages (category 3 is skipped)
        url = "http://weixin.sogou.com/pcindex/pc/pc_" + str(i) + "/pc_" + str(i) + ".html"
        # e.g. http://weixin.sogou.com/pcindex/pc/pc_1/pc_1.html
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        response = urllib2.urlopen(request, timeout=120)
        doc = response.read()
        soup = BeautifulSoup(doc, "html.parser")
        # The first link is the featured article; the first image is its cover
        a = soup.find("a")
        href = a.get("href")
        img = soup.find("img")
        src = img.get("src")
        # print href;
        try:
            fileName = createFileName("jpeg")

            # Protocol-relative URLs ("//...") need an explicit scheme
            if not src.startswith("http"):
                src = "http:" + src

            # Mirror the cover image to Qiniu Cloud
            qiniu(src, fileName)
        except:
            continue
        time.sleep(3)
        getContent(href, i, qiniu_server_url + fileName)

def post(body=None):
    url = "http://test.lastidea.com/Admin/SystemArticle/add"  // your url
    #url = "http://localhost:8091/Admin/ArticleAdd/add"

    headers = {"Content-type": "application/x-www-form-urlencoded"}

    response = requests.post(url, data=body, headers=headers)

def getContent(url, sogouClassId, src):
    socket.setdefaulttimeout(100)

    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    request.add_header('Referer', 'https://mp.weixin.qq.com/')
    response = urllib2.urlopen(request)
    doc = response.read()
    soup = BeautifulSoup(doc, "html.parser")

    title = soup.find(id="activity-name")
    imgs = soup.find(id="js_content").findAll("img")
    for img in imgs:
        imgDataSrc = img.get('data-src')
        imgType = img.get('data-type')
        if imgDataSrc:
            if not imgDataSrc.startswith("http"):
                imgDataSrc = "http:" + imgDataSrc;
                
            fileName = createFileName(imgType)
            count = 1
            while count <= 3:  # retry the Qiniu fetch up to three times on timeout
                try:
                    qiniu(imgDataSrc, fileName)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1

            img['data-src'] = qiniu_server_url + fileName
            img['data-original'] = qiniu_server_url + fileName
            img['src'] = qiniu_server_url + "loading.gif"  # placeholder shown while the real image lazy-loads
            img['class'] = "lazy"

    # Map the Sogou category index to the CMS article class id
    classMap = {
        1: 17,  # Hot
        4: 16,  # Gossip
        0: 10,  # Funny
        8: 11,  # Lifestyle
        7: 12,  # Cars
        6: 13,  # Finance
        5: 14,  # Tech
        2: 15,  # Wellness
    }
    articleClassId = classMap[sogouClassId]

    # Store the title node plus the article body as the article HTML
    jsContent = soup.select("#activity-name, #js_content")
    jsContent = str(jsContent[0]) + str(jsContent[1])

    body = {
        "title" : title.getText().strip(),
        "articleClassId" : articleClassId,
        "img" : src,
        "content" : jsContent,
        "attr[]" : 1,
        "click" : random.randint(10000, 100000)
    }
    
    post(body=body)

# Fetch a list of candidate proxy IPs from xicidaili.com
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, "html.parser")
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

# Pick a random proxy IP from the list
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies


# Build a unique file name from a millisecond timestamp plus a random suffix
def createFileName(ext="png"):
    return str(int(round(time.time() * 1000))) + str(random.randint(10000, 99999)) + "." + str(ext)

# Qiniu Cloud public download domain (trailing slash matters: file keys are appended directly)
qiniu_server_url = "http://ph4xfr5l1.bkt.clouddn.com/"
go()
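To spot-check that an image actually landed in the bucket after a run, the SDK's stat call returns a stored file's metadata. A quick sketch, again with placeholder keys, bucket name, and file key:

from qiniu import Auth, BucketManager

q = Auth("your-access-key", "your-secret-key")
bucket = BucketManager(q)

# Look up metadata for an uploaded key; a successful fetch leaves a record here
ret, info = bucket.stat("your-bucket", "your-file-key.jpeg")
if ret is not None:
    print(ret['fsize'], ret['mimeType'])  # size in bytes and MIME type
else:
    print(info)  # error details, e.g. when the key does not exist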