Fetching Links in a Loop
This script starts from a given page, collects the page's internal and external links with BeautifulSoup, and then randomly hops from one external link to the next.

from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
import random
import datetime
import re

pages = set()  # not used below
# Seed the RNG with the current time; Python 3.11+ no longer accepts
# a datetime object here, so pass the timestamp instead
random.seed(datetime.datetime.now().timestamp())
# Collect a list of all internal links found on a page
def getInternalLinks(bsObject, includeUrl):
    internalLinks = []
    # Match links that begin with "/" or that contain the site's own domain
    for link in bsObject.findAll('a', href=re.compile('^(/|.*' + includeUrl + ')')):
        if link.attrs['href'] is not None:
            href = link.attrs['href']
            if href.startswith('/'):
                # Turn a site-relative path into an absolute URL
                href = 'http://' + includeUrl + href
            if href not in internalLinks:
                internalLinks.append(href)
    return internalLinks
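To see what this returns, here is a quick offline check (the HTML fragment and the example.com domain are made up for illustration):

sample = BeautifulSoup(
    '<a href="/news/"></a>'
    '<a href="http://example.com/about"></a>'
    '<a href="http://other.org/page"></a>',
    'html.parser')
print(getInternalLinks(sample, 'example.com'))
# ['http://example.com/news/', 'http://example.com/about']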
# Split an address into its parts; index 0 is the domain
def splitAddress(address):
    addressParts = address.replace('http://', '').replace('https://', '').split('/')
    return addressParts
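For example:

print(splitAddress('http://www.baidu.com/s?wd=python'))
# ['www.baidu.com', 's?wd=python']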
# Collect a list of all external links found on a page
def getExternalLinks(bsObject, excludeUrl):
    externalLinks = []
    # Match links that start with "http" and do not contain the current domain
    for link in bsObject.findAll('a', {'href': re.compile('^(http)((?!' + excludeUrl + ').)*$')}):
        if link.attrs['href'] is not None:
            href = link.attrs['href']
            if href not in externalLinks:
                externalLinks.append(href)
    return externalLinks
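Another quick offline check with a made-up fragment; the negative lookahead in the pattern filters out every URL that contains the excluded domain:

sample = BeautifulSoup(
    '<a href="http://example.com/about"></a>'
    '<a href="https://other.org/page"></a>',
    'html.parser')
print(getExternalLinks(sample, 'example.com'))
# ['https://other.org/page']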
# Pick one external link at random; if the page has none, follow a random
# internal link and try again from there
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObject = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bsObject, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        internalLinks = getInternalLinks(bsObject, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
# Keep hopping: print a random external link, then repeat from that link;
# if a page fails to load or parse, back up and pick a different link
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link: ' + externalLink)
    try:
        followExternalOnly(externalLink)
    except (HTTPError, ValueError, URLError):
        followExternalOnly(startingSite)

followExternalOnly('http://www.baidu.com/')
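One caveat: the recursion above never returns on its own, so a long walk will eventually hit Python's recursion limit. A loop-based sketch like the one below avoids that (maxHops is an arbitrary cap, not part of the original code):

def followExternalOnlyIter(startingSite, maxHops=50):
    site = startingSite
    for _ in range(maxHops):
        try:
            site = getRandomExternalLink(site)
            print('Random external link: ' + site)
        except (HTTPError, ValueError, URLError):
            # Skip pages that fail to load or parse and retry from here
            continue

# followExternalOnlyIter('http://www.baidu.com/', maxHops=10)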