Python爬虫(Python3.6)

2018-05-05  本文已影响0人  Ucan先生
import urllib.request
import urllib.error
import os
import re
import imageio
# Crawler for a course site: for every chapter and section it downloads the
# section page (saved under its <title>) and each .gif image the page
# references (saved under the same relative path inside OUTPUT_DIR).

# Chapter ids 5301..5312 and section numbers 1..19, as in the original loops.
CHAPTER_IDS = range(5301, 5313)
SECTION_IDS = range(1, 20)
OUTPUT_DIR = 'C:/Users/zybang/Desktop/gaoshu'
BASE_URL = "http://netedu.xauat.edu.cn/jpkc/netedu/jpkc/gdsx/homepage/5jxsd/51/513/"

# Captures the src of <img> tags whose path contains a "/" and ends in ".gif".
# [^>] / [^"] keep the match inside a single tag and a single attribute value.
IMAGE_PATTERN = re.compile(r'<img[^>]*?src="([^"]*?/[^"]*?\.gif)"', re.IGNORECASE)
TITLE_PATTERN = re.compile(r'<title>(.*?)</title>', re.IGNORECASE | re.DOTALL)
# Characters that cannot appear in a Windows file name, plus line breaks/tabs.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')


def section_page_id(chapter_id, section_id):
    """Return the page id: the chapter id followed by the 2-digit section number."""
    return "{}{:02d}".format(chapter_id, section_id)


def chapter_url(base_url, chapter_id):
    """Return the directory URL of one chapter, always derived from *base_url*.

    Building it from the unchanged base (instead of appending to a running
    URL) keeps chapter ids from piling up as ".../5301/5302/".
    """
    return base_url + str(chapter_id) + "/"


def find_image_paths(html):
    """Return the relative paths of all .gif images referenced by *html*."""
    return IMAGE_PATTERN.findall(html)


def page_file_stem(html, fallback):
    """Return a file-system-safe name for a page, taken from its <title>.

    Falls back to *fallback* when the page has no title or the title is empty
    after cleaning.
    """
    match = TITLE_PATTERN.search(html)
    if match is None:
        return fallback
    cleaned = UNSAFE_FILENAME_CHARS.sub("_", match.group(1).strip())
    return cleaned or fallback


def read_url(url):
    """Fetch *url* and return the response body as bytes.

    Raises:
        urllib.error.URLError: on any fetch failure (HTTPError is a subclass).
    """
    with urllib.request.urlopen(url) as response:
        return response.read()


def save_image(chapter_base, image, output_dir):
    """Download one image and store it under *output_dir* at its relative path."""
    image_url = chapter_base + image
    try:
        image_bytes = read_url(image_url)
    except urllib.error.URLError as e:
        print(image_url)
        print(e.reason)
        return
    target = output_dir + "/" + image
    # Create every intermediate directory, not only the first path component.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "wb") as image_file:
        image_file.write(image_bytes)


def download_section(chapter_base, page_id, output_dir):
    """Download one section page and the .gif images it references.

    A page that cannot be fetched is reported on stdout and skipped.
    """
    request_url = chapter_base + page_id + '.htm'
    try:
        data = read_url(request_url)
    except urllib.error.HTTPError as e:
        print(request_url)
        print(e.code)
        return
    except urllib.error.URLError as e:
        # Connection/DNS failures carry no HTTP status code.
        print(request_url)
        print(e.reason)
        return

    # Decode once for both title and image extraction; the raw bytes are what
    # gets written to disk, so undecodable bytes only affect the parsing.
    html = data.decode('gbk', errors='replace')
    stem = page_file_stem(html, page_id)
    with open(output_dir + '/' + stem + '.htm', 'wb') as page_file:
        page_file.write(data)

    for image in find_image_paths(html):
        save_image(chapter_base, image, output_dir)


def main():
    """Crawl every section of every chapter into OUTPUT_DIR."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for chapter_id in CHAPTER_IDS:
        chapter_base = chapter_url(BASE_URL, chapter_id)
        # The section range restarts for every chapter and always advances,
        # whether or not the page exists or contains images.
        for section_id in SECTION_IDS:
            page_id = section_page_id(chapter_id, section_id)
            download_section(chapter_base, page_id, OUTPUT_DIR)


if __name__ == "__main__":
    main()

上一篇下一篇

猜你喜欢

热点阅读