Python Crawlers
2017-11-09
ZHQIAN
1. News crawler practice (scraping all the news linked from the Sina News homepage)
Approach:
1. Fetch the news homepage
2. Extract the link of each news article
3. Fetch each news link
4. Check whether the page contains a frame
5. If it does, grab the page content the frame points to
6. If it does not, grab the current page directly
Example program:
import urllib.request
import re

url = "http://news.sina.com.cn/"
data = urllib.request.urlopen(url).read().decode("UTF-8", "ignore")
pat1 = '<a target="_blank" href="(http://.*?)"'
alllink = re.compile(pat1).findall(data)
for i in range(0, len(alllink)):
    try:
        thislink = alllink[i]
        thispage = urllib.request.urlopen(thislink).read().decode("utf-8", "ignore")
        pat2 = "<frame src=(.*?)>"
        isframe = re.compile(pat2).findall(thispage)
        if len(isframe) == 0:
            # No frame: save the current page directly
            urllib.request.urlretrieve(thislink, "D:/test/" + str(i) + ".html")
        else:
            # A frame exists: save the page the frame points to
            flink = isframe[0]
            urllib.request.urlretrieve(flink, "D:/test/" + str(i) + ".html")
    except Exception:
        pass
2. Scraping Qiushibaike jokes:
import urllib.request
import re

headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
fh = open("D:/test/123.txt", "w")  # save the jokes to a file as well as printing them
for i in range(0, 13):
    thisurl = "https://www.qiushibaike.com/8hr/page/" + str(i + 1) + "/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst = re.compile(pat, re.S).findall(data)
    print(len(rst))
    for j in range(0, len(rst)):
        print(rst[j])
        fh.write(rst[j] + "\n")
        print("-----------------------------------")
fh.close()
3. Building a User-Agent pool (scraping Qiushibaike while posing as different browsers):
import urllib.request
import re
import random  # provides random.choice for picking a User-Agent at random

uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    # Pick a random User-Agent and install it as the global opener
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(0, 13):
    ua(uapools)  # switch to a fresh User-Agent before each page
    thisurl = "https://www.qiushibaike.com/8hr/page/" + str(i + 1) + "/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst = re.compile(pat, re.S).findall(data)
    print(len(rst))
    for j in range(0, len(rst)):
        print(rst[j])
        print("-----------------------------------")
4. IP proxies
Overview: an IP proxy lets the crawler fetch the target site through a proxy IP, so the target server sees the proxy's address rather than your own.
Hands-on: building an IP proxy:
import urllib.request

ip = "200.122.209.10:8080"
# Route all http requests through the proxy
proxy = urllib.request.ProxyHandler({"http": ip})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url = "http://www.baidu.com"
data1 = urllib.request.urlopen(url).read()
data = data1.decode("utf-8", "ignore")
print(len(data))
fh = open("D:/test/ip_baidu.html", "wb")
fh.write(data1)
fh.close()
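To confirm that requests really go out through the proxy, one quick check (a sketch; httpbin.org/ip is just one convenient IP-echo service, not something from the original notes):

import urllib.request

proxy = urllib.request.ProxyHandler({"http": "200.122.209.10:8080"})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
# httpbin echoes the IP the request appeared to come from;
# if the proxy works, this prints the proxy's address instead of yours
print(urllib.request.urlopen("http://httpbin.org/ip").read().decode("utf-8"))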
Building an IP proxy pool, scheme 1 (suitable when the proxy IPs are stable):
import random
import urllib.request

ippools = [
    "141.196.142.8:8080",
    "119.40.106.69:8081",
    "40.132.242.226:3128",
]

def ip(ippools):
    thisip = random.choice(ippools)
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(0, 5):
    try:
        ip(ippools)
        url = "http://www.baidu.com"
        data1 = urllib.request.urlopen(url).read()
        data = data1.decode("utf-8", "ignore")
        print(len(data))
        fh = open("D:/test/ip_baidu_" + str(i) + ".html", "wb")
        fh.write(data1)
        fh.close()
    except Exception as err:
        print(err)
Building an IP proxy pool, scheme 2 (fetching IPs from a provider API, suitable when the proxy IPs are unstable), sketched below.
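A minimal sketch of this approach, assuming the provider exposes an extraction API that returns one "ip:port" per request as plain text (the api URL below is a placeholder, not a real service):

import urllib.request

api = "http://api.your-proxy-provider.com/getip?num=1"  # placeholder URL

def ip():
    # Fetch a fresh proxy from the API with a plain (proxy-free) opener,
    # so a dead proxy installed earlier cannot block the API call itself
    thisip = urllib.request.build_opener().open(api).read().decode("utf-8", "ignore").strip()
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(0, 5):
    try:
        ip()  # a new proxy for every request, so a dead IP only costs one retry
        data = urllib.request.urlopen("http://www.baidu.com").read().decode("utf-8", "ignore")
        print(len(data))
    except Exception as err:
        print(err)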
Taobao product-image crawler practice
import urllib.request
import re
import random

keyname = "连衣裙"
key = urllib.request.quote(keyname)  # URL-encode the Chinese keyword
uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(1, 101):
    # Each result page advances the s parameter by 44 items
    url = "https://s.taobao.com/search?q=" + key + "&s=" + str((i - 1) * 44)
    ua(uapools)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    pat = '"pic_url":"//(.*?)"'
    imglist = re.compile(pat).findall(data)
    for j in range(0, len(imglist)):
        thisimg = imglist[j]
        thisimgurl = "http://" + thisimg
        localfile = "D:/test/test1/" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimgurl, filename=localfile)
Using a User-Agent pool and an IP proxy together
# Use an IP proxy and a User-Agent at the same time
import urllib.request
import random

# User-Agent pool and IP proxy pool
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)"]
ippools = ["61.135.217.7", "118.114.77.47", "111.224.104.161"]
url = 'http://www.baidu.com'

# Attach a random User-Agent to the request and install a random proxy IP
def ua(uapools, ippools):
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random.choice(uapools))
    # The scheme key must match the URL being fetched ("http" here, not "https")
    proxy = urllib.request.ProxyHandler({"http": random.choice(ippools)})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Install the opener globally
    urllib.request.install_opener(opener)
    return req

if __name__ == '__main__':
    for i in range(20):
        req = ua(uapools, ippools)
        try:
            data = urllib.request.urlopen(req)
            print(len(data.read()))
            print(data.getcode())
        except Exception as err:
            print(err)
WeChat crawler practice
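A minimal sketch of one common approach from this period: searching for articles through Sogou's WeChat search. The URL parameters and the link regex below are assumptions tied to the page layout of the time, so treat them as illustrative rather than tested:

import urllib.request
import re

key = urllib.request.quote("物联网")  # example keyword, URL-encoded
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
for page in range(1, 3):
    # type=2 asks Sogou's WeChat search for articles rather than accounts
    url = "http://weixin.sogou.com/weixin?type=2&query=" + key + "&page=" + str(page)
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        # Hypothetical pattern for article links; adjust to the actual page source
        pat = 'href="(http://mp.weixin.qq.com/.*?)"'
        for link in re.compile(pat).findall(data):
            print(link.replace("&amp;", "&"))
    except Exception as err:
        print(err)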
Tencent Video comment crawler practice
import urllib.request
import re

vid = "1453179977"
cid = "6310753745936743097"
num = "20"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
           "Content-Type": "application/javascript"
           }
opener = urllib.request.build_opener()
headall = []
for key, value in headers.items():
    item = (key, value)
    headall.append(item)
opener.addheaders = headall
urllib.request.install_opener(opener)
for j in range(0, 2):
    print("page " + str(j))
    thisurl = "https://coral.qq.com/article/" + vid + "/comment?commentid=" + cid + "&reqnum=" + num
    data = urllib.request.urlopen(thisurl).read().decode("utf-8")
    contentpat = '"content":"(.*?)"'
    contentall = re.compile(contentpat, re.S).findall(data)
    # The "last" field is the cursor pointing at the next page of comments
    lastpat = '"last":"(.*?)"'
    cid = re.compile(lastpat, re.S).findall(data)[0]
    for i in range(0, len(contentall)):
        try:
            # The API returns \uXXXX escapes; eval on a u"..." literal decodes them
            print("content:" + eval('u"' + contentall[i] + '"'))
            print("----------")
        except Exception as err:
            print(err)
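The eval trick above executes whatever is inside the captured string and breaks on comments containing quotes. A safer equivalent for decoding the \uXXXX escapes, using only the standard library (a sketch; it replaces the eval line):

# Decode \uXXXX escape sequences without eval
text = contentall[i].encode("latin-1", "backslashreplace").decode("unicode_escape")
print("content:" + text)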
Scraping movie links from Movie Paradise (dytt8.net)
import urllib.request
import re
import random

uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    thisua = random.choice(uapools)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

fh = open("D:/test1/test.txt", "w")
for i in range(0, 14):
    ua(uapools)
    url = "http://www.dytt8.net/html/tv/oumeitv/list_9_" + str(i) + ".html"
    data = urllib.request.urlopen(url).read().decode("gb2312", "ignore")
    pat = '<b>.*?<a.*?href="(.*?)".*?class="ulink">'
    rst = re.compile(pat, re.S).findall(data)
    for j in range(0, len(rst)):
        try:
            thisurl = "http://www.dytt8.net" + rst[j]
            thisdata = urllib.request.urlopen(thisurl).read().decode("gb2312", "ignore")
            thispat = '"#fdfddf"><a href="(.*?)">'
            thisrst = re.compile(thispat, re.S).findall(thisdata)
            for z in range(0, len(thisrst)):
                fh.write(str(thisrst[z]) + "\n--------------------------------------------------------\n")
        except Exception as err:
            print(err)
fh.close()
Movie Paradise (all categories)
import urllib.request
import re
import random

uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    thisua = random.choice(uapools)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

fh = open("D:/test1/dianying.txt", "w")
# Category paths and the numeric list IDs that appear in their URLs
catpaths = ["gndy/dyzz", "gndy/jddy", "tv/rihantv", "zongyi2013", "2009zongyi", "dongman", "tv/hytv", "gndy/rihan"]
listids = ["23", "63", "8", "99", "89", "16", "71", "6"]
for this in range(0, 8):
    for i in range(1, 100):
        ua(uapools)
        # The first six categories live on ygdy8.net, the last two on dytt8.net
        if this < 6:
            url = "http://www.ygdy8.net/html/" + catpaths[this] + "/list_" + listids[this] + "_" + str(i) + ".html"
        else:
            url = "http://www.dytt8.net/html/" + catpaths[this] + "/list_" + listids[this] + "_" + str(i) + ".html"
        data = urllib.request.urlopen(url).read().decode("gb2312", "ignore")
        pat = '<b>.*?<a.*?href="(.*?)".*?class="ulink">'
        rst = re.compile(pat, re.S).findall(data)
        for j in range(0, len(rst)):
            try:
                if this < 6:
                    thisurl = "http://www.ygdy8.net" + rst[j]
                else:
                    thisurl = "http://www.dytt8.net" + rst[j]
                thisdata = urllib.request.urlopen(thisurl).read().decode("gb2312", "ignore")
                thispat = '"#fdfddf"><a href="(.*?)">'
                thisrst = re.compile(thispat, re.S).findall(thisdata)
                for z in range(0, len(thisrst)):
                    fh.write(str(thisrst[z]) + "\n")
                print(len(thisrst))
            except Exception as err:
                print(err)
fh.close()