Python Crawlers
2017-11-09
ZHQIAN
1. News crawler practice (scraping all the news linked from the Sina News homepage)
Approach:
1. Fetch the news homepage
2. Extract the link of each news article
3. Fetch each news link
4. Check whether the page contains a frame
5. If it does, grab the page content the frame points to
6. If it does not, grab the current page directly
Example program:
import urllib.request
import re

url = "http://news.sina.com.cn/"
data = urllib.request.urlopen(url).read().decode("UTF-8", "ignore")
pat1 = '<a target="_blank" href="(http://.*?)"'
alllink = re.compile(pat1).findall(data)
for i in range(0, len(alllink)):
    try:
        thislink = alllink[i]
        thispage = urllib.request.urlopen(thislink).read().decode("utf-8", "ignore")
        pat2 = "<frame src=(.*?)>"
        isframe = re.compile(pat2).findall(thispage)
        if len(isframe) == 0:
            # No frame: save the current page directly
            urllib.request.urlretrieve(thislink, "D:/test/" + str(i) + ".html")
        else:
            # A frame exists: save the page the frame points to
            flink = isframe[0]
            urllib.request.urlretrieve(flink, "D:/test/" + str(i) + ".html")
    except Exception:
        pass
2. Scraping Qiushibaike jokes:
import urllib.request
import re

headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
fh = open("D:/test/123.txt", "w")  # save the jokes to a file as well as printing them
for i in range(0, 13):
    thisurl = "https://www.qiushibaike.com/8hr/page/" + str(i + 1) + "/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst = re.compile(pat, re.S).findall(data)
    print(len(rst))
    for j in range(0, len(rst)):
        print(rst[j])
        fh.write(rst[j] + "\n")
        print("-----------------------------------")
fh.close()
3. Building a User-Agent pool (scraping Qiushibaike while posing as different browsers):
import urllib.request
import re
import random  # provides random.choice for picking a User-Agent at random

uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    # Pick a random User-Agent and install it as the global opener
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(0, 13):
    ua(uapools)  # switch to a fresh User-Agent before each page
    thisurl = "https://www.qiushibaike.com/8hr/page/" + str(i + 1) + "/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst = re.compile(pat, re.S).findall(data)
    print(len(rst))
    for j in range(0, len(rst)):
        print(rst[j])
        print("-----------------------------------")
4. IP proxies
Overview: an IP proxy lets the crawler fetch the target site through a proxy IP, so the target server sees the proxy's address rather than your own.
Hands-on: building an IP proxy:
import urllib.request

ip = "200.122.209.10:8080"
# Route all http requests through the proxy
proxy = urllib.request.ProxyHandler({"http": ip})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url = "http://www.baidu.com"
data1 = urllib.request.urlopen(url).read()
data = data1.decode("utf-8", "ignore")
print(len(data))
fh = open("D:/test/ip_baidu.html", "wb")
fh.write(data1)
fh.close()
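To confirm that requests really go out through the proxy, one quick check (a sketch; httpbin.org/ip is just one convenient IP-echo service, not something from the original notes):

import urllib.request

proxy = urllib.request.ProxyHandler({"http": "200.122.209.10:8080"})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
# httpbin echoes the IP the request appeared to come from;
# if the proxy works, this prints the proxy's address instead of yours
print(urllib.request.urlopen("http://httpbin.org/ip").read().decode("utf-8"))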
Building an IP proxy pool, scheme 1 (suitable when the proxy IPs are stable):
import random
import urllib.request

ippools = [
    "141.196.142.8:8080",
    "119.40.106.69:8081",
    "40.132.242.226:3128",
]

def ip(ippools):
    thisip = random.choice(ippools)
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(0, 5):
    try:
        ip(ippools)
        url = "http://www.baidu.com"
        data1 = urllib.request.urlopen(url).read()
        data = data1.decode("utf-8", "ignore")
        print(len(data))
        fh = open("D:/test/ip_baidu_" + str(i) + ".html", "wb")
        fh.write(data1)
        fh.close()
    except Exception as err:
        print(err)
Building an IP proxy pool, scheme 2 (fetching IPs from a provider API, suitable when the proxy IPs are unstable), sketched below.
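A minimal sketch of this approach, assuming the provider exposes an extraction API that returns one "ip:port" per request as plain text (the api URL below is a placeholder, not a real service):

import urllib.request

api = "http://api.your-proxy-provider.com/getip?num=1"  # placeholder URL

def ip():
    # Fetch a fresh proxy from the API with a plain (proxy-free) opener,
    # so a dead proxy installed earlier cannot block the API call itself
    thisip = urllib.request.build_opener().open(api).read().decode("utf-8", "ignore").strip()
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(0, 5):
    try:
        ip()  # a new proxy for every request, so a dead IP only costs one retry
        data = urllib.request.urlopen("http://www.baidu.com").read().decode("utf-8", "ignore")
        print(len(data))
    except Exception as err:
        print(err)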
Taobao product-image crawler practice
import urllib.request
import re
import random

keyname = "连衣裙"
key = urllib.request.quote(keyname)  # URL-encode the Chinese keyword
uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(1, 101):
    # Each result page advances the s parameter by 44 items
    url = "https://s.taobao.com/search?q=" + key + "&s=" + str((i - 1) * 44)
    ua(uapools)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    pat = '"pic_url":"//(.*?)"'
    imglist = re.compile(pat).findall(data)
    for j in range(0, len(imglist)):
        thisimg = imglist[j]
        thisimgurl = "http://" + thisimg
        localfile = "D:/test/test1/" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimgurl, filename=localfile)
Using a User-Agent pool and an IP proxy together
# Use an IP proxy and a User-Agent at the same time
import urllib.request
import random

# User-Agent pool and IP proxy pool
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)"]
ippools = ["61.135.217.7", "118.114.77.47", "111.224.104.161"]
url = 'http://www.baidu.com'

# Attach a random User-Agent to the request and install a random proxy IP
def ua(uapools, ippools):
    req = urllib.request.Request(url)
    req.add_header("User-Agent", random.choice(uapools))
    # The scheme key must match the URL being fetched ("http" here, not "https")
    proxy = urllib.request.ProxyHandler({"http": random.choice(ippools)})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Install the opener globally
    urllib.request.install_opener(opener)
    return req

if __name__ == '__main__':
    for i in range(20):
        req = ua(uapools, ippools)
        try:
            data = urllib.request.urlopen(req)
            print(len(data.read()))
            print(data.getcode())
        except Exception as err:
            print(err)
WeChat crawler practice
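A minimal sketch of one common approach from this period: searching for articles through Sogou's WeChat search. The URL parameters and the link regex below are assumptions tied to the page layout of the time, so treat them as illustrative rather than tested:

import urllib.request
import re

key = urllib.request.quote("物联网")  # example keyword, URL-encoded
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
for page in range(1, 3):
    # type=2 asks Sogou's WeChat search for articles rather than accounts
    url = "http://weixin.sogou.com/weixin?type=2&query=" + key + "&page=" + str(page)
    try:
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        # Hypothetical pattern for article links; adjust to the actual page source
        pat = 'href="(http://mp.weixin.qq.com/.*?)"'
        for link in re.compile(pat).findall(data):
            print(link.replace("&amp;", "&"))
    except Exception as err:
        print(err)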
Tencent Video comment crawler practice
import urllib.request
import re

vid = "1453179977"
cid = "6310753745936743097"
num = "20"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
           "Content-Type": "application/javascript"
           }
opener = urllib.request.build_opener()
headall = []
for key, value in headers.items():
    item = (key, value)
    headall.append(item)
opener.addheaders = headall
urllib.request.install_opener(opener)
for j in range(0, 2):
    print("page " + str(j))
    thisurl = "https://coral.qq.com/article/" + vid + "/comment?commentid=" + cid + "&reqnum=" + num
    data = urllib.request.urlopen(thisurl).read().decode("utf-8")
    contentpat = '"content":"(.*?)"'
    contentall = re.compile(contentpat, re.S).findall(data)
    # The "last" field is the cursor pointing at the next page of comments
    lastpat = '"last":"(.*?)"'
    cid = re.compile(lastpat, re.S).findall(data)[0]
    for i in range(0, len(contentall)):
        try:
            # The API returns \uXXXX escapes; eval on a u"..." literal decodes them
            print("content:" + eval('u"' + contentall[i] + '"'))
            print("----------")
        except Exception as err:
            print(err)
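The eval trick above executes whatever is inside the captured string and breaks on comments containing quotes. A safer equivalent for decoding the \uXXXX escapes, using only the standard library (a sketch; it replaces the eval line):

# Decode \uXXXX escape sequences without eval
text = contentall[i].encode("latin-1", "backslashreplace").decode("unicode_escape")
print("content:" + text)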
Scraping movie links from Movie Paradise (dytt8.net)
import urllib.request
import re
import random

uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    thisua = random.choice(uapools)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

fh = open("D:/test1/test.txt", "w")
for i in range(0, 14):
    ua(uapools)
    url = "http://www.dytt8.net/html/tv/oumeitv/list_9_" + str(i) + ".html"
    data = urllib.request.urlopen(url).read().decode("gb2312", "ignore")
    pat = '<b>.*?<a.*?href="(.*?)".*?class="ulink">'
    rst = re.compile(pat, re.S).findall(data)
    for j in range(0, len(rst)):
        try:
            thisurl = "http://www.dytt8.net" + rst[j]
            thisdata = urllib.request.urlopen(thisurl).read().decode("gb2312", "ignore")
            thispat = '"#fdfddf"><a href="(.*?)">'
            thisrst = re.compile(thispat, re.S).findall(thisdata)
            for z in range(0, len(thisrst)):
                fh.write(str(thisrst[z]) + "\n--------------------------------------------------------\n")
        except Exception as err:
            print(err)
fh.close()
Movie Paradise (all categories)
import urllib.request
import re
import random

uapools = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
]

def ua(uapools):
    thisua = random.choice(uapools)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

fh = open("D:/test1/dianying.txt", "w")
# Category paths and the numeric list IDs that appear in their URLs
catpaths = ["gndy/dyzz", "gndy/jddy", "tv/rihantv", "zongyi2013", "2009zongyi", "dongman", "tv/hytv", "gndy/rihan"]
listids = ["23", "63", "8", "99", "89", "16", "71", "6"]
for this in range(0, 8):
    for i in range(1, 100):
        ua(uapools)
        # The first six categories live on ygdy8.net, the last two on dytt8.net
        if this < 6:
            url = "http://www.ygdy8.net/html/" + catpaths[this] + "/list_" + listids[this] + "_" + str(i) + ".html"
        else:
            url = "http://www.dytt8.net/html/" + catpaths[this] + "/list_" + listids[this] + "_" + str(i) + ".html"
        data = urllib.request.urlopen(url).read().decode("gb2312", "ignore")
        pat = '<b>.*?<a.*?href="(.*?)".*?class="ulink">'
        rst = re.compile(pat, re.S).findall(data)
        for j in range(0, len(rst)):
            try:
                if this < 6:
                    thisurl = "http://www.ygdy8.net" + rst[j]
                else:
                    thisurl = "http://www.dytt8.net" + rst[j]
                thisdata = urllib.request.urlopen(thisurl).read().decode("gb2312", "ignore")
                thispat = '"#fdfddf"><a href="(.*?)">'
                thisrst = re.compile(thispat, re.S).findall(thisdata)
                for z in range(0, len(thisrst)):
                    fh.write(str(thisrst[z]) + "\n")
                print(len(thisrst))
            except Exception as err:
                print(err)
fh.close()