
Python Web Scraping, Part 3: Crawling Random External Links

2017-06-19  by 语落心生

In the first two parts, everything we did was based on parsing the HTML structure of a single page. A real web crawler, however, follows links from one page to the next, building up a "map" of the web. So this time we will crawl external links.
Example: http://oreilly.com
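Before writing the crawler, it helps to pin down what "external" means here: a link whose domain (the netloc that urlparse returns) differs from the domain of the page it sits on, while relative links and same-domain links are internal. A minimal standard-library sketch of that test (the is_external helper and the sample URLs are mine, for illustration only):

from urllib.parse import urlparse

site = urlparse("http://oreilly.com").netloc  # 'oreilly.com'

def is_external(href, site_netloc=site):
    """Return True if href points outside the current site (hypothetical helper)."""
    netloc = urlparse(href).netloc
    # Relative links such as "/about" have an empty netloc, so they are internal
    return bool(netloc) and site_netloc not in netloc

print(is_external("/about"))                             # False: relative, internal
print(is_external("http://oreilly.com/books"))           # False: same domain
print(is_external("https://en.wikipedia.org/wiki/Cat"))  # True: different domain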

First, test whether we can get hold of an external link at all:

from urllib.parse import urlparse
import random
import datetime
import re

pages = set()
# Seed the RNG so each run takes a different random path
random.seed(datetime.datetime.now().timestamp())

# Collect all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    # Match links that begin with "/" or contain the site's own domain
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

def followExternalOnly(startingPage):
    # Stub: a hard-coded external link stands in for the real lookup for now
    externalLink = "https://en.wikipedia.org/wiki/Intelligence_agency"
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)  # loops on the same link; stop with Ctrl-C

if __name__ == '__main__':
    followExternalOnly("http://en.wikipedia.org")

Console output: the recursion iterated over external links, 56 in total.



A site's home page is not guaranteed to contain external links. As the console-output experiment in Part 2 showed, a page's HTML may contain no external links at all. Compare the HTML structure of https://en.wikipedia.org/wiki/Main_Page with that of https://en.wikipedia.org/wiki/Auriscalpium_vulgare:

(Screenshots: the HTML structure of the two pages side by side.)
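One way to confirm the difference without eyeballing raw HTML is to count the anchors on each page that match the external-link pattern used later in this post. A quick sketch (the countExternalLinks helper is illustrative, and the counts depend on the live page content at the time you run it):

import re
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup

def countExternalLinks(url):
    # Count <a> tags whose href starts with http/www and avoids the page's own domain
    bsObj = BeautifulSoup(urlopen(url), "html.parser")
    domain = urlparse(url).netloc
    links = bsObj.findAll("a", href=re.compile("^(http|www)((?!" + domain + ").)*$"))
    return len(links)

print(countExternalLinks("https://en.wikipedia.org/wiki/Main_Page"))
print(countExternalLinks("https://en.wikipedia.org/wiki/Auriscalpium_vulgare"))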

The DFS logic for finding an external link from a page is as follows:
We search recursively; reaching an external link counts as reaching a leaf node. If the page has no external links, we switch to one of its internal links instead, end the current level of recursion, and backtrack, continuing the search from that randomly chosen internal page.

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import random
import datetime
import re

pages = set()
# Seed the RNG so each run takes a different random path
random.seed(datetime.datetime.now().timestamp())

# Collect all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    # Match links that begin with "/" or contain the site's own domain
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

# Collect all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Match links that start with http or www and do not contain the current domain
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            # Skip duplicates
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        # No external links here: pick a random internal link and retry from there
        print("No external links found, looking inside the site")
        domain = urlparse(startingPage).scheme + "://" + urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingPage):
    externalLink = getRandomExternalLink(startingPage)
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

if __name__ == '__main__':
    followExternalOnly("https://en.wikipedia.org/wiki/Main_Page")

Console output: the crawler prints one random external link per hop.

Tips: in the spirit of hopping to random external links, readers may enjoy these introductions to the currently popular blockchain:

A plain-language introduction to blockchain: http://python.jobbole.com/88248/
Ruan Yifeng's blockchain primer: http://www.ruanyifeng.com/blog/2017/12/blockchain-tutorial.html
