Scraping Sogou WeChat articles with Python and saving the images to Qiniu Cloud
2018-11-23 · app_developer
For the method of saving images to Qiniu Cloud, see http://lastidea.net/?p=7. The full script follows (Python 2, since it relies on urllib2).
# -*- coding: UTF-8 -*-
import urllib2
from bs4 import BeautifulSoup
import socket
import requests
import time
import random
from qiniu import Auth
from qiniu import BucketManager
# Qiniu Cloud storage: fetch a remote image URL into the bucket under the given key
def qiniu(url, key):
    access_key = ''   # your access_key
    secret_key = ''   # your secret_key
    bucket_name = ''  # your bucket_name
    q = Auth(access_key, secret_key)
    bucket = BucketManager(q)
    ret, info = bucket.fetch(url, bucket_name, key)
    #assert ret['key'] == key
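For reference, here is a minimal standalone sketch of the same fetch call. The credentials, bucket name, and image URL are placeholders, not values from this article; it checks the returned key instead of asserting:

# minimal sketch of BucketManager.fetch with placeholder credentials
from qiniu import Auth, BucketManager

q = Auth('ACCESS_KEY', 'SECRET_KEY')
bucket = BucketManager(q)
ret, info = bucket.fetch('http://example.com/pic.jpg', 'BUCKET', 'pic.jpg')
if ret is not None and ret.get('key') == 'pic.jpg':
    print('fetch ok')
else:
    print(info)  # raw response info, useful for debugging failed fetches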
def go():
    # route requests through a random proxy
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    ip_list = get_ip_list(url, headers=headers)
    proxies = get_random_ip(ip_list)
    enable_proxy = True
    proxy_handler = urllib2.ProxyHandler(proxies)
    null_proxy_handler = urllib2.ProxyHandler({})
    if enable_proxy:
        opener = urllib2.build_opener(proxy_handler)
    else:
        opener = urllib2.build_opener(null_proxy_handler)
    urllib2.install_opener(opener)
    for i in [0, 1, 2, 4, 5, 6, 7, 8]:
        # e.g. http://weixin.sogou.com/pcindex/pc/pc_1/pc_1.html
        url = "http://weixin.sogou.com/pcindex/pc/pc_" + str(i) + "/pc_" + str(i) + ".html"
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        response = urllib2.urlopen(request, timeout=120)
        doc = response.read()
        soup = BeautifulSoup(''.join(doc), "html.parser")
        a = soup.find("a")
        href = a.get("href")
        img = soup.find("img")
        src = img.get("src")
        try:
            fileName = creatFileName("jpeg")
            if not src.startswith("http"):
                src = "http:" + src
            # fetch the cover image into Qiniu
            qiniu(src, fileName)
        except:
            continue  # skip this list page if the cover image cannot be fetched
        time.sleep(3)
        getContent(href, i, qiniu_server_url + fileName)
def post(body=None):
    url = "http://test.lastidea.com/Admin/SystemArticle/add"  # your url
    #url = "http://localhost:8091/Admin/ArticleAdd/add"
    headers = {"Content-type": "application/x-www-form-urlencoded"}
    response = requests.post(url, data=body, headers=headers)
    #print response.text
def getContent(url, sogouClassId, src):
    socket.setdefaulttimeout(100)
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
    request.add_header('Referer', 'https://mp.weixin.qq.com/')
    response = urllib2.urlopen(request)
    doc = response.read()
    soup = BeautifulSoup(''.join(doc), "html.parser")
    title = soup.find(id="activity-name")
    imgs = soup.find(id="js_content").findAll("img")
    for img in imgs:
        imgDataSrc = img.get('data-src')
        imgType = img.get('data-type')
        if imgDataSrc:
            if not imgDataSrc.startswith("http"):
                imgDataSrc = "http:" + imgDataSrc
            fileName = creatFileName(imgType)
            count = 1
            while count <= 3:  # retry the Qiniu fetch up to 3 times on timeout
                try:
                    qiniu(imgDataSrc, fileName)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            img['data-src'] = qiniu_server_url + fileName
            img['data-original'] = qiniu_server_url + fileName
            img['src'] = qiniu_server_url + "loading.gif"  # lazy-load placeholder image
            img['class'] = "lazy"
            #time.sleep(1)
    # assemble the POST data
    if sogouClassId == 1: articleClassId = 17    # hot
    elif sogouClassId == 4: articleClassId = 16  # gossip
    elif sogouClassId == 0: articleClassId = 10  # funny
    elif sogouClassId == 8: articleClassId = 11  # lifestyle
    elif sogouClassId == 7: articleClassId = 12  # cars
    elif sogouClassId == 6: articleClassId = 13  # finance
    elif sogouClassId == 5: articleClassId = 14  # tech
    elif sogouClassId == 2: articleClassId = 15  # health
    jsContent = soup.select("#activity-name, #js_content")
    jsContent = jsContent[0].__str__() + jsContent[1].__str__()
    body = {
        "title": title.getText().strip(),
        "articleClassId": articleClassId,
        "img": src,
        "content": jsContent,
        "attr[]": 1,
        "click": random.randint(10000, 100000)
    }
    post(body=body)
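As a maintainability note, the elif chain above can be expressed as a lookup table. This sketch uses exactly the same category values, just restructured:

# same Sogou index -> article class mapping as the elif chain, as a dict
SOGOU_TO_ARTICLE_CLASS = {
    1: 17,  # hot
    4: 16,  # gossip
    0: 10,  # funny
    8: 11,  # lifestyle
    7: 12,  # cars
    6: 13,  # finance
    5: 14,  # tech
    2: 15,  # health
}
articleClassId = SOGOU_TO_ARTICLE_CLASS.get(sogouClassId)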
# fetch a list of proxy IPs from the xicidaili proxy table
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, "html.parser")
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # skip the table header row
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
# pick one proxy IP at random
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
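Together the two helpers return a proxies dict in the shape that urllib2.ProxyHandler (and requests) expects. A quick check, assuming the xicidaili table layout still matches the parsing above:

# quick check of the proxy helpers; the printed IP/port depends on what the site lists
ips = get_ip_list('http://www.xicidaili.com/nn/',
                  headers={'User-Agent': 'Mozilla/5.0'})
print(get_random_ip(ips))  # e.g. {'http': 'http://121.8.98.196:80'} (example value only)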
# build a unique file name: millisecond timestamp + 5 random digits + extension
def creatFileName(ext="png"):
    return str(int(round(time.time() * 1000))) + str(random.randint(10000, 99999)) + "." + str(ext)
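For example, creatFileName("jpeg") returns something like 154297531201512345.jpeg (the timestamp part varies with the clock, the five digits with the random draw).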
# Qiniu external-link (CDN) domain; note the trailing slash so keys concatenate cleanly
qiniu_server_url = "http://ph4xfr5l1.bkt.clouddn.com/"
go()
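To run the script you need Python 2 (it uses urllib2) and the third-party packages: pip install requests beautifulsoup4 qiniu. Fill in the Qiniu access key, secret key, bucket name, and CDN domain, plus the target post URL, before starting it.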