Python Crawler Learning (3): Crawling Random External Links
2017-06-19
语落心生
In the previous two installments, everything we did was based on parsing the HTML structure of a single page. A real-world crawler, however, hops from one link to the next and builds up a "map" of the web, so this time we will crawl external links.
Example: http://oreilly.com
First, let's test whether we can extract external links at all:
from urllib.parse import urlparse
import random
import datetime
import re

pages = set()
random.seed(datetime.datetime.now())

# Collect the internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    # Links that begin with "/" or contain the current domain are internal
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

def followExternalOnly(startingPage):
    # For this first test, hard-code one external link instead of fetching anything
    externalLink = "https://en.wikipedia.org/wiki/Intelligence_agency"
    print("Random external link is " + externalLink)
    # With a hard-coded link this recurses on the same URL until interrupted;
    # the full version below picks a fresh link on every hop
    followExternalOnly(externalLink)

followExternalOnly("http://en.wikipedia.org")
Console output: the recursion iterated through the external links, 56 in total.
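As a quick offline check, we can also feed getInternalLinks a hand-written snippet of HTML instead of a live page. This is only a sketch, and the URLs in it are made up for illustration:

from bs4 import BeautifulSoup

sampleHtml = """
<a href="/wiki/Python">relative internal</a>
<a href="https://en.wikipedia.org/wiki/Spider">absolute internal</a>
<a href="http://oreilly.com">external</a>
"""
bsObj = BeautifulSoup(sampleHtml, "html.parser")
# Reuses the getInternalLinks defined above
print(getInternalLinks(bsObj, "https://en.wikipedia.org/wiki/Main_Page"))
# Expected: ['https://en.wikipedia.org/wiki/Python',
#            'https://en.wikipedia.org/wiki/Spider']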
A site's front page is not guaranteed to contain any external links; as the console-output experiment in part 2 showed, a page's HTML structure may hold no external links at all.
Compare the HTML structure of https://en.wikipedia.org/wiki/Main_Page with that of https://en.wikipedia.org/wiki/Auriscalpium_vulgare.
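The structure screenshots have not survived, but the difference is easy to reproduce by counting candidate external links on each page with the same regex the crawler uses below. This is a rough sketch, and the exact counts will drift as Wikipedia changes:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

for url in ("https://en.wikipedia.org/wiki/Main_Page",
            "https://en.wikipedia.org/wiki/Auriscalpium_vulgare"):
    bsObj = BeautifulSoup(urlopen(url), "html.parser")
    # Links that start with http/www and do not contain the current domain
    external = bsObj.findAll("a", href=re.compile("^(http|www)((?!en.wikipedia.org).)*$"))
    print(url, "->", len(external), "candidate external links")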
The DFS logic for finding an external link on a page is as follows:
We search recursively: hitting an external link counts as reaching a leaf node. If no external link is found on the page, we pick an internal link instead, end the current level of recursion, and backtrack to continue the search from that internal page.
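Before the full listing, it is worth sanity-checking the regex that classifies a link as external; the negative lookahead (?!...) rejects any URL that contains the current domain. A minimal sketch with made-up URLs:

import re

excludeUrl = "en.wikipedia.org"
pattern = re.compile("^(http|www)((?!" + excludeUrl + ").)*$")

print(bool(pattern.match("http://oreilly.com")))                    # True: external
print(bool(pattern.match("https://en.wikipedia.org/wiki/Spider")))  # False: same domain
print(bool(pattern.match("/wiki/Main_Page")))                       # False: relative link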
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import random
import datetime
import re

pages = set()
random.seed(datetime.datetime.now())

# Collect the internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    includeUrl = urlparse(includeUrl).scheme + "://" + urlparse(includeUrl).netloc
    internalLinks = []
    for link in bsObj.findAll("a", href=re.compile("^(/|.*" + includeUrl + ")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

# Collect the external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # Match links starting with http or www that do not contain the current domain
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!" + excludeUrl + ").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        # No external link here: recurse into a random internal link and keep looking
        print("No external links found")
        domain = urlparse(startingPage).scheme + "://" + urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingPage):
    externalLink = getRandomExternalLink(startingPage)
    print("Random external link is " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("https://en.wikipedia.org/wiki/Main_Page")
Console output: one "Random external link is ..." line per hop.
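As written, followExternalOnly recurses until it hits a dead link or a connection error and then crashes with an unhandled exception. Here is a minimal defensive sketch; the maxDepth parameter and the error handling are my additions, not part of the original:

from urllib.error import HTTPError, URLError

def followExternalOnlySafe(startingPage, maxDepth=10):
    # Stop after a fixed number of hops instead of recursing forever
    if maxDepth == 0:
        print("Reached maximum depth, stopping")
        return
    try:
        externalLink = getRandomExternalLink(startingPage)
    except (HTTPError, URLError) as e:
        # Dead or unreachable link: report it and stop this branch
        print("Could not fetch " + startingPage + ": " + str(e))
        return
    print("Random external link is " + externalLink)
    followExternalOnlySafe(externalLink, maxDepth - 1)

followExternalOnlySafe("https://en.wikipedia.org/wiki/Main_Page")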
Tips: in the spirit of following random external links, you may also want to look at blockchain, which is all the rage right now:
An easy-to-understand introduction to blockchain: http://python.jobbole.com/88248/
Ruan Yifeng's blockchain primer: http://www.ruanyifeng.com/blog/2017/12/blockchain-tutorial.html