# Python爬虫(Python3.6)
# 2018-05-05 本文已影响0人
# Ucan先生
import urllib.request
import urllib.error
import os
import re
import imageio
# Crawl configuration. Chapter pages live under BASE_URL/<chapter>/ and each
# section page is named <chapter><two-digit section>.htm.
BASE_URL = "http://netedu.xauat.edu.cn/jpkc/netedu/jpkc/gdsx/homepage/5jxsd/51/513/"
OUTPUT_DIR = 'C:/Users/zybang/Desktop/gaoshu'
FIRST_CHAPTER_ID = 5301
LAST_CHAPTER_ID = 5312
FIRST_SECTION_ID = 1
LAST_SECTION_ID = 19

# Only .gif images whose src contains a directory part are collected. The
# match is confined to a single tag and a single quoted attribute value.
IMAGE_PATTERN = re.compile(r'<img[^>]*?\bsrc="([^"]*/[^"]*\.gif)"', re.IGNORECASE)
TITLE_PATTERN = re.compile(r'<title>(.*?)</title>', re.IGNORECASE)
# Characters that cannot appear in a Windows file name.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def section_page_name(chapter_id, section_id):
    """Return the page stem for a section, e.g. (5301, 1) -> '530101'."""
    return "{}{:02d}".format(chapter_id, section_id)


def extract_image_sources(page_text):
    """Return the unique image ``src`` values in *page_text*, in page order."""
    seen = set()
    sources = []
    for src in IMAGE_PATTERN.findall(page_text):
        if src not in seen:
            seen.add(src)
            sources.append(src)
    return sources


def page_title(page_text, fallback):
    """Return a file-name-safe page title, or *fallback* if none is usable.

    Args:
        page_text: Decoded HTML of the page.
        fallback: Name to use when the page has no (or an empty) <title>.
    """
    found = TITLE_PATTERN.search(page_text)
    if found is None:
        return fallback
    cleaned = UNSAFE_FILENAME_CHARS.sub('_', found.group(1).strip())
    return cleaned or fallback


def local_image_path(output_dir, image_src):
    """Map a page-relative image path to a file path inside *output_dir*.

    Returns:
        The normalized target path, or None when *image_src* would resolve
        outside *output_dir* (absolute paths, ``..`` traversal). The src comes
        from remote HTML, so it must not be trusted as a local path.
    """
    root = os.path.normpath(output_dir)
    target = os.path.normpath(os.path.join(root, image_src))
    if not target.startswith(root + os.sep):
        return None
    return target


def download(request_url):
    """Fetch *request_url* and return the response body as bytes.

    Raises:
        urllib.error.URLError: If the request fails (HTTPError is a subclass).
    """
    with urllib.request.urlopen(request_url) as response:
        return response.read()


def crawl_section(chapter_url, chapter_id, section_id, output_dir):
    """Save one section page and the .gif images it references.

    Failed downloads are reported on stdout and skipped; they never abort
    the crawl.
    """
    page_name = section_page_name(chapter_id, section_id)
    request_url = chapter_url + page_name + '.htm'
    try:
        page_bytes = download(request_url)
    except urllib.error.HTTPError as e:
        print(request_url)
        print(e.code)
        return
    except urllib.error.URLError as e:
        print(request_url)
        print(e.reason)
        return

    # Decode only for parsing; the file on disk keeps the original bytes.
    page_text = page_bytes.decode('gbk', errors='replace')
    title = page_title(page_text, page_name)
    with open(os.path.join(output_dir, title + '.htm'), 'wb') as page_file:
        page_file.write(page_bytes)

    for image_src in extract_image_sources(page_text):
        target = local_image_path(output_dir, image_src)
        if target is None:
            print("skipped image outside output directory: " + image_src)
            continue
        image_url = chapter_url + image_src
        try:
            image_bytes = download(image_url)
        except urllib.error.URLError as e:
            print(image_url)
            print(e.reason)
            continue
        # Create the full parent chain, not just the first path component.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, "wb") as image_file:
            image_file.write(image_bytes)


def main():
    """Crawl every section of every configured chapter into OUTPUT_DIR."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for chapter_id in range(FIRST_CHAPTER_ID, LAST_CHAPTER_ID + 1):
        # Build each chapter URL from the base so chapters do not accumulate.
        chapter_url = "{}{}/".format(BASE_URL, chapter_id)
        # The section range restarts for every chapter.
        for section_id in range(FIRST_SECTION_ID, LAST_SECTION_ID + 1):
            crawl_section(chapter_url, chapter_id, section_id, OUTPUT_DIR)


if __name__ == "__main__":
    main()