python自动爬取别致数据并保存图片

2016-11-09  本文已影响86人  代码没写完休想上厕所
#!/usr/bin/env python
# -*- coding: utf-8 -*- 
import urllib2
import urllib
import xlrd
import xlwt
import xlutils
from xlutils.copy import copy
import sys
reload(sys)
sys.setdefaultencoding('utf8')

'''
Created on 11 9, 2016

@author: xwang

Usage:
1. Create "biezhidb.xls" in the directory the script is run from (it is opened by relative path).
2. Images are saved to /Users/xunwang/Desktop/别致爬虫/pic2/
'''

# Prefix of every item page; the numeric item id is appended to it.
baseUrl = "http://chocolateback.sinaapp.com/"
# Index of the spreadsheet row that the next item is written to.
currentRow = 0

def testXlrd(filename):
    """Return the number of rows in the first sheet of the workbook `filename`.

    filename -- path of the .xls workbook to inspect
    """
    return xlrd.open_workbook(filename).sheet_by_index(0).nrows

def testXlwt(filename, index, data):
    """Write `data` into column `index` of row `currentRow` and save the file.

    The workbook at `filename` is read with xlrd, turned into a writable
    workbook with xlutils.copy.copy, the cell on the first sheet is set, and
    the result is saved back over `filename`.

    filename -- path of the .xls workbook to update in place
    index    -- column index of the cell to write
    data     -- value to store in the cell
    """
    # currentRow is the module-level row cursor; it is only read here, so no
    # `global` declaration is required.
    writable_book = copy(xlrd.open_workbook(filename))
    writable_book.get_sheet(0).write(currentRow, index, data)
    writable_book.save(filename)

def _extract_between(cont, head, tail):
    """Return the stripped text between the first `head` and the next `tail`.

    cont -- raw page body as returned by read(); decoded here as UTF-8
    head -- marker that immediately precedes the wanted text
    tail -- marker that immediately follows the wanted text

    Raises ValueError when either marker is absent, so the caller can skip
    the page instead of slicing at find()'s -1 and storing unrelated text.
    """
    start = cont.find(head)
    if start == -1:
        raise ValueError("marker not found: %r" % head)
    start += len(head)
    end = cont.find(tail, start)
    if end == -1:
        raise ValueError("closing marker not found: %r" % tail)
    return unicode(cont[start:end].strip(), "utf-8")


def pachong():
    """Crawl item pages 10000..13999 and store one spreadsheet row per item.

    For every id the page at baseUrl + id is fetched and the outbound link,
    title, price and image URL are cut out of the HTML. Items whose link is a
    "redirect" link are skipped. The fields are written to 'biezhidb.xls' at
    row `currentRow` (column 0 = id, 1 = title, 2 = link, 3 = price,
    4 = image URL), the image is downloaded with storePic, and `currentRow`
    is advanced.

    Any error while fetching, parsing, writing or downloading is reported and
    that id is skipped without advancing `currentRow`, so the next successful
    item reuses the row. KeyboardInterrupt and SystemExit are not caught, so
    the crawl can be stopped.
    """
    global currentRow
    for i in range(10000, 14000):
        page_url = baseUrl + str(i)
        print(page_url)
        try:
            up = urllib2.urlopen(page_url)
            try:
                cont = up.read()
            finally:
                # Release the connection even if read() fails.
                up.close()

            # Parse every field before touching the workbook so a page with
            # missing markup never leaves a half-written row behind.
            link = _extract_between(cont, '<a target="_blank" href="', '">')
            # Characters 7..14 are what follows a 7-character prefix such as
            # "http://"; presumably this detects redirect links.
            if link[7:15] == "redirect":
                continue
            title = _extract_between(cont, '<span class="title">', '</span>')
            print(title)
            price = _extract_between(cont, '<span class="price">', '</span>')
            # The head marker consumes the leading "h" of the image URL, so
            # it is put back here.
            picUrl = u"h" + _extract_between(cont, '<img src="h', '">')

            testXlwt('biezhidb.xls', 2, link)
            testXlwt('biezhidb.xls', 0, str(i))
            testXlwt('biezhidb.xls', 1, title)
            testXlwt('biezhidb.xls', 3, price)
            testXlwt('biezhidb.xls', 4, picUrl)
            storePic(picUrl, i)
            currentRow = currentRow + 1
        except Exception as exc:
            # Best-effort crawl: report the failure and move on to the next id.
            print("skip %s: %r" % (page_url, exc))

def storePic(url, id):
    """Download the image at `url` into the local picture directory.

    url -- address of the image to fetch
    id  -- item id, used to build the file name biezhi_<id>.jpg
    """
    target = '/Users/xunwang/Desktop/别致爬虫/pic2/biezhi_%s.jpg' % (id)
    urllib.urlretrieve(url, target)

if __name__ == '__main__':
    # Start writing at the row count of the existing workbook, i.e. directly
    # after the rows it already contains.
    currentRow = testXlrd('biezhidb.xls')
    print(currentRow)
    pachong()
    print("写入完毕!")
上一篇下一篇

猜你喜欢

热点阅读